LPDM Analysis

Baseline characteristics

*** Characteristics at Month 2

baseline <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")

baseline_Table1 <- baseline %>% 
  dplyr::select(subjid, agemonth, sex, site, case, arm, pneumon, diarrho, cerebral, shock, ends_with("2"))


baseline_Table1 <- baseline_Table1 %>% dplyr::select(!ends_with("12"), -c(height2, weight2, datem2))

baseline_Table1.1 <- baseline_Table1 %>% 
  dplyr::select(agemonth, sex, site, case, arm, muac2, zlen2, zwei2, zwfl2, 
                pneumon, diarrho, shock, cerebral,oedema2, hg2, platelet2, wbc2, lymph2, neutrop2)


baseline_Table1.1 %>% 
  tbl_summary(by = case,
              #type = list(all_continuous() ~ "continuous2"),
              statistic = list(all_continuous() ~ c("{median} ({p25} - {p75})")),
              digits = list(all_continuous() ~ 1), 
              label = list(sex ~ "Males (%)",
                           agemonth ~ "Age at admission, months",
                           site ~ "Site",
                           sex ~ "Sex (%)",
                           arm ~ "Treatment Arm",
                           muac2 ~ "MUAC",
                           zlen2 ~ "LAZ score",
                           zwei2 ~ "WAZ score",
                           zwfl2 ~ "WLZ score",
                           oedema2 ~ "Oedema (%)",
                           pneumon ~ "Pneumonia",
                           diarrho ~ "Diarrhoea",
                           shock ~ "Shock",
                           cerebral ~ "Cerebral palsy",
                           hg2 ~ "Haemoglobin counts (g/dl)",
                           platelet2 ~ "Platelets (10^3/L)",
                           wbc2 ~ "Total WBC counts (10^3/L)",
                           lymph2 ~ "Lymphocyte counts (10^3/L)",
                           neutrop2 ~ "Neutrophil counts (10^3/L)"),
              missing = "no",
              value = list(sex ~ "Female")) %>% 
                modify_spanning_header(c("stat_1", "stat_2") ~ "**Survival Status**") %>%
                modify_header(label = "**Participants' characteristics**",
                              all_stat_cols() ~ "**{level}**\n(n={n})") %>%
                
                add_overall(last = T, col_label = "**Total**\n(n={N})") %>%
                
                add_p(list(all_continuous() ~ "t.test",
                           all_categorical() ~ "chisq.test"),
                      pvalue_fun = function(x) style_pvalue(x, digits = 1)) %>% 
  as_gt()  %>% gt::gtsave(filename = "../Figures/Tables/CTX_LPDM_Table_1_m2.docx")
## The following warnings were returned during `as_gt()`:
## ! For variable `cerebral` (`case`) and "statistic", "p.value", and "parameter"
##   statistics: Chi-squared approximation may be incorrect
## ! For variable `oedema2` (`case`) and "statistic", "p.value", and "parameter"
##   statistics: Chi-squared approximation may be incorrect
## ! For variable `shock` (`case`) and "statistic", "p.value", and "parameter"
##   statistics: Chi-squared approximation may be incorrect
## ! For variable `site` (`case`) and "statistic", "p.value", and "parameter"
##   statistics: Chi-squared approximation may be incorrect

*** Characteristics at Enrolment

baseline <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")


baseline_enrol <- baseline %>% 
  dplyr::select(subjid, agemonth, sex, site, case, arm, pneumon, diarrho, cerebral, shock, ends_with("0"))


baseline_enrol <- baseline_enrol %>% dplyr::select(!ends_with("10"), -c(height0, weight0))

baseline_enrol <- baseline %>% 
  dplyr::select(agemonth, sex, site, case, arm, muac0, zlen0, zwei0, zwfl0, 
                pneumon, diarrho, shock, cerebral,oedema0, ehg, eplatelet, ewbc, elymph, eneutrop)


baseline_enrol %>% 
  tbl_summary(by = case,
              statistic = list(all_continuous() ~ c("{median} ({p25} - {p75})")),
              digits = list(all_continuous() ~ 1), 
              label = list(
                           agemonth ~ "Age at admission, months",
                           site ~ "Site",
                           sex ~ "Sex (%)",
                           arm ~ "",
                           muac0 ~ "MUAC",
                           zlen0 ~ "LAZ score",
                           zwei0 ~ "WAZ score",
                           zwfl0 ~ "WLZ score",
                           oedema0 ~ "Oedema (%)",
                           pneumon ~ "Pneumonia",
                           diarrho ~ "Diarrhoea",
                           shock ~ "Shock",
                           cerebral ~ "Cerebral palsy",
                           ehg ~ "Haemoglobin counts (g/dl)",
                           eplatelet ~ "Platelets (10^3/L)",
                           ewbc ~ "Total WBC counts (10^3/L)",
                           elymph ~ "Lymphocyte counts (10^3/L)",
                           eneutrop ~ "Neutrophil counts (10^3/L)"),
              missing = "no",
              value = list(sex ~ "Male")) %>% 
                modify_spanning_header(c("stat_1", "stat_2") ~ "**Survival Status**") %>%
                modify_header(label = "**Participants' characteristics**",
                              all_stat_cols() ~ "**{level}**\n(n={n})") %>%
                
                add_overall(last = T, col_label = "**Total**\n(n={N})") %>%
                
                add_p(list(all_continuous() ~ "t.test",
                           all_categorical() ~ "chisq.test"),
                      pvalue_fun = function(x) style_pvalue(x, digits = 1)) %>% as_gt()  %>% 
  gt::gtsave(filename = "../Figures/Tables/CTX_LPDM_Table_1_enrolment.docx")
## The following warnings were returned during `as_gt()`:
## ! For variable `cerebral` (`case`) and "statistic", "p.value", and "parameter"
##   statistics: Chi-squared approximation may be incorrect
## ! For variable `shock` (`case`) and "statistic", "p.value", and "parameter"
##   statistics: Chi-squared approximation may be incorrect
## ! For variable `site` (`case`) and "statistic", "p.value", and "parameter"
##   statistics: Chi-squared approximation may be incorrect

*** Characteristics at month 2 stratified by sex

baseline <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")


baseline_Table1 <- baseline %>% 
  dplyr::select(subjid, agemonth, sex, site, case, arm, pneumon, diarrho, cerebral, shock, ends_with("2"))


baseline_Table1 <- baseline_Table1 %>% dplyr::select(!ends_with("12"), -c(height2, weight2, datem2))

baseline_Table1.1 <- baseline_Table1 %>% 
  dplyr::select(agemonth, sex, site, case, arm, muac2, zlen2, zwei2, zwfl2, 
                pneumon, diarrho, shock, cerebral,oedema2, hg2, platelet2, wbc2, lymph2, neutrop2)

baseline_Table1.1 %>% 
  tbl_summary(by = case,
              type = list(all_continuous() ~ "continuous2"),
              statistic = list(all_continuous() ~ c("{median} ({p25} - {p75})")),
              digits = list(all_continuous() ~ 1), 
              label = list(sex ~ "Males (%)",
                           agemonth ~ "Age at admission, months",
                           site ~ "Site",
                           sex ~ "Sex (%)",
                           arm ~ "",
                           muac2 ~ "MUAC",
                           zlen2 ~ "LAZ score",
                           zwei2 ~ "WAZ score",
                           zwfl2 ~ "WLZ score",
                           oedema2 ~ "Oedema (%)",
                           pneumon ~ "Pneumonia",
                           diarrho ~ "Diarrhoea",
                           shock ~ "Shock",
                           cerebral ~ "Cerebral palsy",
                           hg2 ~ "Haemoglobin counts (g/dl)",
                           platelet2 ~ "Platelets (10^3/L)",
                           wbc2 ~ "Total WBC counts (10^3/L)",
                           lymph2 ~ "Lymphocyte counts (10^3/L)",
                           neutrop2 ~ "Neutrophil counts (10^3/L)"),
              missing = "no",
              value = list(sex ~ "Female")) %>% 
                modify_spanning_header(c("stat_1", "stat_2") ~ "**Survival Status**") %>%
                modify_header(label = "**Participants' characteristics**",
                              all_stat_cols() ~ "**{level}**\n(n={n})") %>%
                
                add_overall(last = T, col_label = "**Total**\n(n={N})") %>%
                
                add_p(list(all_continuous() ~ "t.test",
                           all_categorical() ~ "chisq.test"),
                      pvalue_fun = function(x) style_pvalue(x, digits = 1)) %>% 
  as_gt()  %>% gt::gtsave(filename = "../Figures/Tables/Table1bysex.docx")
## The following warnings were returned during `as_gt()`:
## ! For variable `cerebral` (`case`) and "statistic", "p.value", and "parameter"
##   statistics: Chi-squared approximation may be incorrect
## ! For variable `oedema2` (`case`) and "statistic", "p.value", and "parameter"
##   statistics: Chi-squared approximation may be incorrect
## ! For variable `shock` (`case`) and "statistic", "p.value", and "parameter"
##   statistics: Chi-squared approximation may be incorrect
## ! For variable `site` (`case`) and "statistic", "p.value", and "parameter"
##   statistics: Chi-squared approximation may be incorrect

*** Table of anthropometry and haematology

baseline <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")

## month2 
baseline_Table1 <- baseline %>% 
  dplyr::select(subjid, agemonth, sex, site, case, arm, pneumon, diarrho, cerebral, shock, ends_with("2"))


baseline_Table1 <- baseline_Table1 %>% dplyr::select(!ends_with("12"), -c(height2, weight2, datem2))

baseline_Table_M2 <- baseline_Table1 %>% 
  dplyr::select(subjid, agemonth, sex, site, case, arm, muac2, zlen2, zwei2, zwfl2, 
                pneumon, diarrho, shock, cerebral,oedema2, hg2, platelet2, wbc2, lymph2, neutrop2)

##Enrolment
baseline_Table_enrol <- baseline %>% 
  dplyr::select(subjid, agemonth, sex, site, case, arm, pneumon, diarrho, cerebral, shock,ehg, elymph, eplatelet, ewbc, eneutrop, ends_with("0"))


baseline_Table_enrol <- baseline_Table_enrol %>% dplyr::select(!ends_with("10"))

baseline_Table_enrol <- baseline_Table_enrol %>% 
    dplyr::select(subjid,agemonth, sex, site, case, arm, muac0, zlen0, zwei0,
                  zwfl0,pneumon, diarrho, shock, cerebral,oedema0,
                  ehg,eplatelet, ewbc, elymph, eneutrop)

base_enrol <- baseline_Table_enrol %>% dplyr::select(c(1,5,7:10,16:20))
base_enrol$Timepoint <- "Enrolment"


base_M2 <- baseline_Table_M2 %>% dplyr::select(c(1,5,7:10,16:20))
base_M2$Timepoint <- "Month 2"

base_enrol <- base_enrol %>% 
  pivot_longer(cols = -c(subjid,Timepoint, case),
               names_to = "haematology",
               values_to = "values")

base_M2 <- base_M2 %>% 
  pivot_longer(cols = -c(subjid,Timepoint,case),
               names_to = "haematology",
               values_to = "values")

combined_anthro_hema <- bind_rows(base_enrol,base_M2)

combined_anthro_hema <- combined_anthro_hema %>% 
  dplyr::mutate(haematology = 
                  case_when(haematology=="ehg"| haematology == "hg2" ~ "Haemoglobin",
                            haematology=="eplatelet" | haematology == "platelet2" ~"Platelets",
                            haematology=="ewbc" | haematology =="wbc2" ~ "WBCs",
                            haematology=="elymph" | haematology == "lymph2" ~ "Lymphocytes",
                            haematology=="eneutrop"| haematology == "neutrop2" ~ "Neutrophils", 
                            haematology=="muac0"| haematology == "muac2" ~ "MUAC", 
                            haematology=="zlen0"| haematology == "zlen2" ~ "LAZ", 
                            haematology=="zwei0"| haematology == "zwei2" ~ "WAZ", 
                            haematology=="zwfl0"| haematology == "zwfl2" ~ "WLZ", 
                            TRUE ~ haematology))

combined_anthro_hema <- combined_anthro_hema %>% 
  dplyr::mutate(case = 
                  case_when(case=="Case" ~ "Cases",
                            case=="Control" ~ "Controls"))


hematology_table1 <- combined_anthro_hema #%>% dplyr::select(-c(subjid))

hematology_table_wide <- hematology_table1 %>% 
  pivot_wider(names_from = haematology,
              values_from = values)

paired_wilcox <- function(data, variable, by, ...) {
  test_result <- wilcox.test(
    formula = as.formula(paste(variable, "~", by)),
    data = data,
    paired = TRUE
  )
  tibble::tibble(p.value = test_result$p.value)
}

hematology_table_wide %>% dplyr::select(-c(subjid)) %>% 
  tbl_strata(
    strata = case,
    .tbl_fun = ~ .x %>%
      tbl_summary(
        by = Timepoint,
        #type = list(all_continuous() ~ "continuous2"),
        statistic = list(all_continuous() ~ "{median} ({p25} - {p75})"),
        digits = list(all_continuous() ~ 1),
        label = list(
          MUAC ~ "MUAC in (cm)",
           LAZ ~ "LAZ",
           WAZ~ "WAZ",
           WLZ ~ "WLZ",
          Haemoglobin ~ "Haemoglobin (g/dl)",
          Platelets ~ "Platelets (10^3/L)",
          WBCs ~ "WBCs (10^3/L)",
          Lymphocytes ~ "Lymphocyte (10^3/L)",
          Neutrophils ~ "Neutrophil (10^3/L)"
        ),
        missing = "no"
      ) %>%
      #  add_p(list(all_continuous() ~ "wilcox.test"
      #   ),
      #   pvalue_fun = function(x) style_pvalue(x, digits = 3)
      # ) %>%
      modify_header(
        label = "**Participants' Measurements**",
        all_stat_cols() ~ "**{level}**\n(n={n})"
      ),
    .combine_with = "tbl_merge"
  ) %>%
  as_gt() %>%
  gt::gtsave("../Figures/Tables/CTX_LPDM_Table_haematologyand anthropometry_by_timepoint_case.docx")
baseline <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")

## month2 
baseline_Table1 <- baseline %>% 
  dplyr::select(subjid, agemonth, sex, site, case, arm, pneumon, diarrho, cerebral, shock, ends_with("2"))


baseline_Table1 <- baseline_Table1 %>% dplyr::select(!ends_with("12"), -c(height2, weight2, datem2))

baseline_Table_M2 <- baseline_Table1 %>% 
  dplyr::select(subjid, agemonth, sex, site, case, arm, muac2, zlen2, zwei2, zwfl2, 
                pneumon, diarrho, shock, cerebral,oedema2, hg2, platelet2, wbc2, lymph2, neutrop2)

##Enrolment
baseline_Table_enrol <- baseline %>% 
  dplyr::select(subjid, agemonth, sex, site, case, arm, pneumon, diarrho, cerebral, shock,ehg, elymph, eplatelet, ewbc, eneutrop, ends_with("0"))


baseline_Table_enrol <- baseline_Table_enrol %>% dplyr::select(!ends_with("10"))

baseline_Table_enrol <- baseline_Table_enrol %>% 
    dplyr::select(subjid,agemonth, sex, site, case, arm, muac0, zlen0, zwei0,
                  zwfl0,pneumon, diarrho, shock, cerebral,oedema0,
                  ehg,eplatelet, ewbc, elymph, eneutrop)

base_enrol <- baseline_Table_enrol %>% dplyr::select(c(1,5,7:10,16:20))
base_enrol$Timepoint <- "Enrolment"


base_M2 <- baseline_Table_M2 %>% dplyr::select(c(1,5,7:10,16:20))
base_M2$Timepoint <- "Month 2"


combined_anthro_regression <- bind_cols(base_enrol,base_M2)
## New names:
## • `subjid` -> `subjid...1`
## • `case` -> `case...2`
## • `Timepoint` -> `Timepoint...12`
## • `subjid` -> `subjid...13`
## • `case` -> `case...14`
## • `Timepoint` -> `Timepoint...24`
combined_anthro_regression <- combined_anthro_regression %>% 
  dplyr::select(-c(starts_with("Timepoint"),"subjid...13","case...14")) %>% 
  dplyr::rename(subjid=subjid...1,case=case...2)

combined_anthro_regression <- baseline %>%
  dplyr::select(subjid,agemonth,site,sex) %>% 
  left_join(., combined_anthro_regression, join_by(subjid))

combined_anthro_regression <- combined_anthro_regression %>% 
  dplyr::mutate(MUAC = muac2 - muac0,
                WAZ = zwei2 - zwei0,
                WHZ = zwfl2 - zwfl0,
                HAZ = zlen2 - zlen0,
                HB = hg2 - ehg,
                Platelets = platelet2 - eplatelet,
                WBCs = wbc2 - ewbc,
                Lymphocytes = lymph2 - elymph,
                Neutrophils = neutrop2 - eneutrop)


ModuleEigen_df <- combined_anthro_regression %>% 
  dplyr::select(MUAC:Neutrophils)

MinModuleSize10_ModuleEigen <- combined_anthro_regression

MinModuleSize10_ModuleEigen <- MinModuleSize10_ModuleEigen %>% 
  dplyr::mutate(case_recoded = case_when(case == "Case" ~ 1,
                                         TRUE ~ 0))

combined_anthro_regression$case <- factor(combined_anthro_regression$case,
                                          levels = c("Case", "Control"))
# 
# combined_anthro_regression$case <- factor(combined_anthro_regression$case,
#                                           levels = c("Case", "Control"))

Adjustedmod_summaries <- list()  
 
for (col in colnames(ModuleEigen_df)) {   

  modules = MinModuleSize10_ModuleEigen[col]

  Adjustedmod_summaries[[col]] <- lme4::glmer(case_recoded ~ modules[[col]] +
                                                     +  sex + muac0 +
                                                   agemonth + (1|site),
                                              family = "binomial",
                                                 data = MinModuleSize10_ModuleEigen)

}
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
coef_estimates <- list()
 
for (col in colnames(ModuleEigen_df)) {

  # Extract the fixed effects coefficients

  fixed_effects_df <- broom.mixed::tidy(Adjustedmod_summaries[[col]])  #%>%  as.data.frame()

  # Append the coefficient data frame to the list

  coef_estimates[[col]] <- fixed_effects_df

}
 
# Combine the coefficient data frames into a single data frame

Coefficient_etimate_df <- do.call(rbind, coef_estimates)
 
Coefficient_etimate_df <- Coefficient_etimate_df %>% 
  rownames_to_column(var = "Measurements")
 
Coefficient_etimate_df <- Coefficient_etimate_df %>% 
  dplyr::filter(term %in% "modules[[col]]")
 
Coefficient_etimate_df_clean <- Coefficient_etimate_df %>%

  separate_wider_delim(Measurements, delim = ".", 
                       names = c("Measurements", "Status")) %>% 

  dplyr::select(!c(Status, effect, group, term))
 
 
Coefficient_etimate_df_clean$adjusted_p.value <- p.adjust(Coefficient_etimate_df_clean$p.value, method = "bonferroni") # fdr
 
Coefficient_etimate_df_clean <- Coefficient_etimate_df_clean %>%

  group_by(Measurements) %>%

  mutate(

    Lower = estimate - 1.96 * std.error,

    Upper = estimate + 1.96 * std.error

  ) %>%

  ungroup()
 
 
# Create a new column 'significance' based on the conditions

Coefficient_etimate_df_clean$significance <- 

  ifelse(Coefficient_etimate_df_clean$p.value <= 0.05, "Yes", "No")

writexl::write_xlsx(Coefficient_etimate_df_clean,"../Figures/Tables/Baseline_characteristics_change_between_casesand_controls_muacAdjust.xlsx") 

Survival analysis at Month 2

*** Data preprocessing *** 4plex

luminex_4plex <- read_csv("../Raw_data/rawdata_4plex_m2.csv")
## Rows: 95 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): subjid, ADAMTS13-2, Angiopoietin-2-2, D-dimer-2, Thrombomodulin-2
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")

luminex_4plex_cleaned_numeric <- luminex_4plex %>% 
  column_to_rownames(var = "subjid") 

## Log normalization and Autoscaling

luminex_4plex_cleaned_numeric_log <- log(na.omit(luminex_4plex_cleaned_numeric))

luminex_4plex_cleaned_numeric_log_auto_scale <- mdatools::prep.autoscale(as.matrix(luminex_4plex_cleaned_numeric_log), 
center = TRUE, scale = TRUE)


luminex_4plex_cleaned_numeric_log_auto_scale <- as.data.frame(luminex_4plex_cleaned_numeric_log_auto_scale)


### Outlier replacement

capOutlier <- function(x){

  qnt <- quantile(x, probs=c(.25, .75), na.rm = T)

  caps <- quantile(x, probs=c(.1, .90), na.rm = T)

  H <- 1.5 * IQR(x, na.rm = T)

  x[x < (qnt[1] - H)] <- caps[1]

  x[x > (qnt[2] + H)] <- caps[2]

  return(x)

}

luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed <- luminex_4plex_cleaned_numeric_log_auto_scale %>% mutate_at(c(1:4), capOutlier)

luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed <- luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed %>% 
  rownames_to_column(var = "subjid")

## Combine with baseline data
baseline$subjid <- as.character(baseline$subjid)

luminex_4plex_meta <- luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed %>% 
  dplyr::left_join(baseline, by = "subjid")

cases_id <- luminex_4plex_meta %>% dplyr::select(subjid, case)


luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed <- luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed %>% 
  dplyr::inner_join(., cases_id, by = "subjid")

write_csv(luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed,"../Processed_data/luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed.csv")

** Hazard ratio MUAC adjusted - 4plex month 2

luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed <- read_csv("../Processed_data/luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed.csv")
## Rows: 95 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): case
## dbl (5): subjid, ADAMTS13-2, Angiopoietin-2-2, D-dimer-2, Thrombomodulin-2
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")

enroldate <- read_csv("../Raw_data/CTX_James_dates_22042025.csv")
## Rows: 128 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): dateenrd, discdate, death_date, case, enddate
## dbl (1): subjid
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(enroldate) <- paste(colnames(enroldate), "T2E", sep = "_")

baseline_time2death <- baseline %>% 
  dplyr::left_join(., enroldate, join_by(subjid == subjid_T2E))


baseline_time2death$enddate_T2E <- as.Date(baseline_time2death$enddate_T2E, format = "%d%b%Y")

baseline_time2death$dateenrd_T2E <- as.Date(baseline_time2death$dateenrd_T2E, format = "%d%b%Y")

write_csv(baseline_time2death,"../Processed_data/baseline_time2death.csv")


baseline <- baseline_time2death %>% 
  dplyr::select(subjid, agemonth, sex, site, arm, case, muac0, muac2, discdate, dateenrd_T2E, datem2, enddate_T2E)


luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed <- luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed %>% dplyr::select(!case)


baseline <- baseline %>% 
  dplyr::mutate(case_recoded = case_when(case == "Case" ~ 1,
                                         case == "Control" ~ 0))

luminex_4plex_HR <- luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed %>% dplyr::left_join(., baseline, join_by(subjid))

luminex_4plex_HR$time2_death <- as.numeric(difftime(luminex_4plex_HR$enddate_T2E, luminex_4plex_HR$datem2, units = "days"))

write_csv(luminex_4plex_HR,"../Processed_data/Model_data/4plex_m2.csv")

protein_cols <- luminex_4plex_HR %>% 
  dplyr::select(`ADAMTS13-2`:`Thrombomodulin-2`)

protein_data <- luminex_4plex_HR
 
# Initialize lists to store results

adjusted_results <- list()
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth + muac0 +
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_results[[col]] <- tidy(adjusted_results[[col]])

}
 
adjusted_summary_df <- do.call(rbind,adjusted_results)

adjusted_summary_df <- adjusted_summary_df %>%
  dplyr::filter(term=='protein[[col]]') %>% 
  rownames_to_column(var = "cytokines") %>% 
   #slice(-c(1:5)) %>% 
  dplyr::select(-term)

adjusted_summary_df$estimate <- as.numeric(adjusted_summary_df$estimate)

adjusted_summary_df$std.error <- as.numeric(adjusted_summary_df$std.error)


adjusted_summary_df$p.value <- as.numeric(adjusted_summary_df$p.value)

final_results_adjusted <- adjusted_summary_df %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 

final_results_adjusted$significance <- ifelse(final_results_adjusted$p.value <= 0.05, "Yes", "No")
 
final_results_adjusted$cytokines <- sub("-2\\.1$", "", final_results_adjusted$cytokines)


write_csv(final_results_adjusted,"../Processed_data/conditionalHR_adjusted_m2_4plex.csv")

** MUAC Unadjusted Hazard ratio - 4plex month 2

luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed <- read_csv("../Processed_data/luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed.csv")
## Rows: 95 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): case
## dbl (5): subjid, ADAMTS13-2, Angiopoietin-2-2, D-dimer-2, Thrombomodulin-2
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")

enroldate <- read_csv("../Raw_data/CTX_James_dates_22042025.csv")
## Rows: 128 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): dateenrd, discdate, death_date, case, enddate
## dbl (1): subjid
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(enroldate) <- paste(colnames(enroldate), "T2E", sep = "_")

baseline_time2death <- baseline %>% 
  dplyr::left_join(., enroldate, join_by(subjid == subjid_T2E))


baseline_time2death$enddate_T2E <- as.Date(baseline_time2death$enddate_T2E, format = "%d%b%Y")

baseline_time2death$dateenrd_T2E <- as.Date(baseline_time2death$dateenrd_T2E, format = "%d%b%Y")

write_csv(baseline_time2death,"../Processed_data/baseline_time2death.csv")


baseline <- baseline_time2death %>% 
  dplyr::select(subjid, agemonth, sex, site, arm, case, muac0, muac2, discdate, dateenrd_T2E, datem2, enddate_T2E)


luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed <- luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed %>% dplyr::select(!case)


baseline <- baseline %>% 
  dplyr::mutate(case_recoded = case_when(case == "Case" ~ 1,
                                         case == "Control" ~ 0))

luminex_4plex_HR <- luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed %>% dplyr::left_join(., baseline, join_by(subjid))

luminex_4plex_HR$time2_death <- as.numeric(difftime(luminex_4plex_HR$enddate_T2E, luminex_4plex_HR$datem2, units = "days"))

write_csv(luminex_4plex_HR,"../Processed_data/Model_data/4plex_m2.csv")

protein_cols <- luminex_4plex_HR %>% 
  dplyr::select(`ADAMTS13-2`:`Thrombomodulin-2`)

protein_data <- luminex_4plex_HR
 
# Initialize lists to store results

adjusted_results <- list()
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth +
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_results[[col]] <- tidy(adjusted_results[[col]])

}
 
adjusted_summary_df <- do.call(rbind,adjusted_results)

adjusted_summary_df <- adjusted_summary_df %>%
  dplyr::filter(term=='protein[[col]]') %>% 
  rownames_to_column(var = "cytokines") %>% 
   #slice(-c(1:5)) %>% 
  dplyr::select(-term)

adjusted_summary_df$estimate <- as.numeric(adjusted_summary_df$estimate)

adjusted_summary_df$std.error <- as.numeric(adjusted_summary_df$std.error)


adjusted_summary_df$p.value <- as.numeric(adjusted_summary_df$p.value)

final_results_adjusted <- adjusted_summary_df %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 

final_results_adjusted$significance <- ifelse(final_results_adjusted$p.value <= 0.05, "Yes", "No")
 
final_results_adjusted$cytokines <- sub("-2\\.1$", "", final_results_adjusted$cytokines)


write_csv(final_results_adjusted,"../Processed_data/UnadjustedMUAC_HR/conditionalHR_m2_4plex.csv")

** 29plex Preprocessing

luminex_29plex <- read_csv("../Raw_data/rawdata_29plex_m2.csv")
## Rows: 96 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (30): subjid, EGF-2, Eotaxin-2, G-CSF-2, GM-CSF-2, IFNa2-2, IFNg-2, IL10...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
luminex_29plex$subjid <- as.character(luminex_29plex$subjid)


luminex_29plex <- luminex_29plex %>% 
  tibble::column_to_rownames(var = "subjid") 


## Log normalization and Autoscaling

luminex_29plex_data_numeric_log <- log(na.omit(luminex_29plex))

luminex_29plex_data_numeric_log_auto_scale <- mdatools::prep.autoscale(as.matrix(luminex_29plex_data_numeric_log), 
                         center = TRUE, scale = TRUE)


luminex_29plex_data_numeric_log_auto_scale <- as.data.frame(luminex_29plex_data_numeric_log_auto_scale)


## Replacing outliers

capOutlier <- function(x){

  qnt <- quantile(x, probs=c(.25, .75), na.rm = T)

  caps <- quantile(x, probs=c(.1, .90), na.rm = T)

  H <- 1.5 * IQR(x, na.rm = T)

  x[x < (qnt[1] - H)] <- caps[1]

  x[x > (qnt[2] + H)] <- caps[2]

  return(x)

}

luminex_29plex_data_numeric_log_auto_scale_Outliers <- luminex_29plex_data_numeric_log_auto_scale %>% mutate_at(c(1:29), capOutlier)


luminex_29plex_data_numeric_log_auto_scale_Outliers <- luminex_29plex_data_numeric_log_auto_scale_Outliers %>% 
  rownames_to_column(var = "subjid")

cases_id <- baseline %>% dplyr::select(subjid,case)

cases_id$subjid <- as.character(cases_id$subjid)


luminex_29plex_data_numeric_log_auto_scale_Outliers <- luminex_29plex_data_numeric_log_auto_scale_Outliers %>% 
  dplyr::inner_join(., cases_id, by = "subjid")

case <- luminex_29plex_data_numeric_log_auto_scale_Outliers$case

write_csv(luminex_29plex_data_numeric_log_auto_scale_Outliers,"../Processed_data/luminex_29plex_data_numeric_log_auto_scale_Outliers.csv")

** MUAC adjusted model

baseline_time2death <- read_csv("../Processed_data/baseline_time2death.csv")
## Rows: 128 Columns: 127
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (26): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (85): subjid, time, agemonth, ehg, elymph, muac0, eneutrop, eplatelet, ...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## date  (2): dateenrd_T2E, enddate_T2E
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")

luminex_29plex_data_numeric_log_auto_scale_Outliers <- read_csv("../Processed_data/Luminex_29plex_data_numeric_log_auto_scale_Outliers.csv")
## Rows: 96 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): case
## dbl (30): subjid, EGF-2, Eotaxin-2, G-CSF-2, GM-CSF-2, IFNa2-2, IFNg-2, IL10...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
luminex_29plex_data_numeric_log_auto_scale_Outliers <- luminex_29plex_data_numeric_log_auto_scale_Outliers %>% dplyr::select(!c(case,`IL-3-2`))

baseline <- baseline_time2death %>% 
  dplyr::select(subjid, agemonth, sex, site, arm, case, discdate, muac0, dateenrd_T2E, datem2, enddate_T2E)

baseline <- baseline %>% 
  dplyr::mutate(case_recoded = case_when(case == "Case" ~ 1,
                                         case == "Control" ~ 0))

luminex_29plex_HR <- luminex_29plex_data_numeric_log_auto_scale_Outliers %>% dplyr::left_join(., baseline, join_by(subjid))

luminex_29plex_HR$subjid <- as.character(luminex_29plex_HR$subjid)


baseline$subjid <- as.character(baseline$subjid)

luminex_29plex_HR$time2_death <- as.numeric(difftime(luminex_29plex_HR$enddate_T2E, luminex_29plex_HR$datem2, units = "days"))

write_csv(luminex_29plex_HR,"../Processed_data/Model_data/29plex_m2.csv")

protein_cols <- luminex_29plex_HR %>% 
  dplyr::select(`EGF-2`:`VEGF-2`)

protein_data <- luminex_29plex_HR
 
# Initialize lists to store results

adjusted_results <- list()
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth + muac0 +
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_results[[col]] <- tidy(adjusted_results[[col]])

}
 
adjusted_summary_df <- do.call(rbind,adjusted_results)

adjusted_summary_df <- adjusted_summary_df %>%
  dplyr::filter(term=='protein[[col]]') %>% 
  rownames_to_column(var = "cytokines") %>% 
   #slice(-c(1:5)) %>% 
  dplyr::select(-term)

adjusted_summary_df$estimate <- as.numeric(adjusted_summary_df$estimate)

adjusted_summary_df$std.error <- as.numeric(adjusted_summary_df$std.error)


adjusted_summary_df$p.value <- as.numeric(adjusted_summary_df$p.value)

final_results_adjusted1 <- adjusted_summary_df %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 

final_results_adjusted$significance <- ifelse(final_results_adjusted$p.value <= 0.05, "Yes", "No")
 
final_results_adjusted$cytokines <- sub("-2\\.1$", "", final_results_adjusted$cytokines)

final_results_adjusted <- final_results_adjusted %>%
  dplyr::mutate(cytokines = case_when(
    cytokines == "IFNa2" ~ "IFNα2",
    cytokines == "IL-1a" ~ "IL-1α",
    cytokines == "IL-1b" ~ "IL-1β",
    cytokines == "TNFa" ~ "TNFα",
    cytokines == "MIP-1a" ~ "MIP-1α",
    cytokines == "TNFb" ~ "TNFβ",
    cytokines == "MIP-1b" ~ "MIP-1β",
    cytokines == "IFNg" ~ "IFN-γ",
    cytokines == "IL10" ~ "IL-10",
    TRUE ~ cytokines
  ))
 

write_csv(final_results_adjusted,"../Processed_data/conditionalHR_adjusted_m2_29plex.csv")

** Not adjusted for MUAC ** 29PLEX

baseline_time2death <- read_csv("../Processed_data/baseline_time2death.csv")
## Rows: 128 Columns: 127
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (26): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (85): subjid, time, agemonth, ehg, elymph, muac0, eneutrop, eplatelet, ...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## date  (2): dateenrd_T2E, enddate_T2E
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")

luminex_29plex_data_numeric_log_auto_scale_Outliers <- read_csv("../Processed_data/Luminex_29plex_data_numeric_log_auto_scale_Outliers.csv")
## Rows: 96 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): case
## dbl (30): subjid, EGF-2, Eotaxin-2, G-CSF-2, GM-CSF-2, IFNa2-2, IFNg-2, IL10...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
luminex_29plex_data_numeric_log_auto_scale_Outliers <- luminex_29plex_data_numeric_log_auto_scale_Outliers %>% dplyr::select(!c(case,`IL-3-2`))

baseline <- baseline_time2death %>% 
  dplyr::select(subjid, agemonth, sex, site, arm, case, discdate, muac0, dateenrd_T2E, datem2, enddate_T2E)

baseline <- baseline %>% 
  dplyr::mutate(case_recoded = case_when(case == "Case" ~ 1,
                                         case == "Control" ~ 0))

luminex_29plex_HR <- luminex_29plex_data_numeric_log_auto_scale_Outliers %>% dplyr::left_join(., baseline, join_by(subjid))

luminex_29plex_HR$subjid <- as.character(luminex_29plex_HR$subjid)


baseline$subjid <- as.character(baseline$subjid)

luminex_29plex_HR$time2_death <- as.numeric(difftime(luminex_29plex_HR$enddate_T2E, luminex_29plex_HR$datem2, units = "days"))

write_csv(luminex_29plex_HR,"../Processed_data/Model_data/29plex_m2.csv")

protein_cols <- luminex_29plex_HR %>% 
  dplyr::select(`EGF-2`:`VEGF-2`)

protein_data <- luminex_29plex_HR
 
# Initialize lists to store results

adjusted_results <- list()
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth +
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_results[[col]] <- tidy(adjusted_results[[col]])

}
 
adjusted_summary_df <- do.call(rbind,adjusted_results)

adjusted_summary_df <- adjusted_summary_df %>%
  dplyr::filter(term=='protein[[col]]') %>% 
  rownames_to_column(var = "cytokines") %>% 
   #slice(-c(1:5)) %>% 
  dplyr::select(-term)

adjusted_summary_df$estimate <- as.numeric(adjusted_summary_df$estimate)

adjusted_summary_df$std.error <- as.numeric(adjusted_summary_df$std.error)


adjusted_summary_df$p.value <- as.numeric(adjusted_summary_df$p.value)

final_results_adjusted1 <- adjusted_summary_df %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 

final_results_adjusted$significance <- ifelse(final_results_adjusted$p.value <= 0.05, "Yes", "No")
 
final_results_adjusted$cytokines <- sub("-2\\.1$", "", final_results_adjusted$cytokines)

final_results_adjusted <- final_results_adjusted %>%
  dplyr::mutate(cytokines = case_when(
    cytokines == "IFNa2" ~ "IFNα2",
    cytokines == "IL-1a" ~ "IL-1α",
    cytokines == "IL-1b" ~ "IL-1β",
    cytokines == "TNFa" ~ "TNFα",
    cytokines == "MIP-1a" ~ "MIP-1α",
    cytokines == "TNFb" ~ "TNFβ",
    cytokines == "MIP-1b" ~ "MIP-1β",
    cytokines == "IFNg" ~ "IFN-γ",
    cytokines == "IL10" ~ "IL-10",
    TRUE ~ cytokines
  ))
 

write_csv(final_results_adjusted,"../Processed_data/UnadjustedMUAC_HR/conditionalHR_m2_29plex.csv")

** Proteomics Month 2

proteomics_file <- read_csv("../Raw_data/LPDM_Protomics_24092024.csv")
## New names:
## Rows: 191 Columns: 613
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): timepoint, group dbl (611): ...1, record_id, A0A1Q1PRB1, A0A024CIM4,
## B7Z1N6, A0A024QZL1, A0A3...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
proteingroup <- read.delim("../Raw_data/proteinGroups_LPDM_15112021.txt")

proteomics_file_m2 <- proteomics_file %>% dplyr::filter(timepoint=="M2")

proteomics_file_m2 <- proteomics_file_m2 %>% dplyr::select(-c(timepoint,group,...1))
  
proteomics_file_m2 <- proteomics_file_m2 %>% 
  dplyr::select(!contains(c("REV__","CON__")))

protein_group_genenames <- proteingroup %>% dplyr::select(Protein.IDs,Gene.names)

protein_group_genenames$Protein.IDs <- gsub(";.*", "", protein_group_genenames$Protein.IDs)

protein_group_genenames$Gene.names <- gsub(";.*", "", protein_group_genenames$Gene.names)

protein_group_genenames$Gene.names <- na_if(protein_group_genenames$Gene.names, '')

protein_group_genenames <- protein_group_genenames %>% dplyr::mutate(Gene.names=coalesce(protein_group_genenames$Gene.names,
                                  protein_group_genenames$Protein.IDs))

proteomics_file_m2 <- proteomics_file_m2 %>% slice(-20) ##participant 671, occured twice and one was removed

#proteomics_file_m2 <- proteomics_file_m2 %>% column_to_rownames(var="record_id")

proteomics_file_uniprotID <- data.frame(Protein.IDs=colnames(proteomics_file_m2))
                                        

proteomics_file_uniprotID <- proteomics_file_uniprotID %>% left_join(.,protein_group_genenames,join_by(Protein.IDs))


#colnames(proteomics_file_m2) <- proteomics_file_uniprotID$Gene.names

#proteomics_file_m2 <- proteomics_file_m2 %>% rownames_to_column(var="record_id")
metadata <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")
proteingroup_metadata <- proteomics_file_m2 %>% 
  dplyr::left_join(.,metadata, join_by(record_id==subjid))


## Subset month 2 data
proteingroup_metadata_m2 <- proteingroup_metadata

 write_csv(proteingroup_metadata_m2,"../Processed_data/proteingroup_metadata_m2_clean.csv")
 ## Filtering 80% missing values
 
 ##Filtering missing values. Proteins missing in 80% of the samples were filtered out. 
 

## Filtering missing data with EdgeR

proteingroup_metadata_m2_intensities <- proteingroup_metadata_m2 %>% 
  dplyr::select(A0A1Q1PRB1:V9HWI6)


edgeR_object<-DGEList(counts = as.matrix(t(proteingroup_metadata_m2_intensities)),samples = proteingroup_metadata_m2,group = proteingroup_metadata_m2$case,remove.zeros = T)
## Removing 10 rows with all zero counts
keep<-filterByExpr(edgeR_object,min.count = 0.1, 
                   min.total.count = 0.1, 
                   large.n = 10, min.prop = 0.8)
edgeR_object<-edgeR_object[keep,]


filtered_proteomics_data <- as.data.frame(t(edgeR_object[["counts"]]))


filtered_sample_data <- as.data.frame(edgeR_object[["samples"]])

## Imputation

 ### Replacing 0 with NA

filtered_proteomics_data[filtered_proteomics_data==0] <- NA


## Imputation using KNN

imp_knn = impute.knn(as.matrix(filtered_proteomics_data,
                     k = 10, rowmax = 0.5, colmax = 0.8,
                     maxp = 1500, rng.seed=362436069))
### Accessing the data
filtered_proteomics_data_knn <- as.data.frame(imp_knn$data) 


## Normalization and replacing outliers
 ### Log normalization
protein_data_arranged_m2_log <- log(na.omit(filtered_proteomics_data_knn))

protein_data_arranged_m2_log_auto_scale <- mdatools::prep.autoscale(as.matrix(protein_data_arranged_m2_log),                                                           center = TRUE, scale = TRUE)


protein_data_arranged_m2_log_auto_scale <- as.data.frame(protein_data_arranged_m2_log_auto_scale)

 ## Replacing outliers 

capOutlier <- function(x){

  qnt <- quantile(x, probs=c(.25, .75), na.rm = T)

  caps <- quantile(x, probs=c(.1, .90), na.rm = T)

  H <- 1.5 * IQR(x, na.rm = T)

  x[x < (qnt[1] - H)] <- caps[1]

  x[x > (qnt[2] + H)] <- caps[2]

  return(x)

}

protein_data_arranged_m2_log_auto_scale_Outliers_removed <- protein_data_arranged_m2_log_auto_scale %>% mutate_at(c(1:374), capOutlier)

write_csv(protein_data_arranged_m2_log_auto_scale_Outliers_removed,"../Processed_data/protein_data_arranged_m2_log_auto_scale_Outliers_removed.csv")

** MUAC adjusted proteins

protein_data_arranged_m2_log_auto_scale_Outliers_removed <- read_csv("../Processed_data/protein_data_arranged_m2_log_auto_scale_Outliers_removed_cleanJN.csv")
## Rows: 98 Columns: 374
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (374): A0A024CIM4, A0A024QZL1, A0A384NYN5, A0A024R035, H0Y612, Q9UFA1, K...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
proteingroup_metadata_m2 <- read_csv("../Processed_data/proteingroup_metadata_m2_clean.csv")
## Rows: 98 Columns: 711
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (23): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, premat...
## dbl  (674): record_id, A0A1Q1PRB1, A0A024CIM4, B7Z1N6, A0A024QZL1, A0A384NYN...
## dttm  (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, d...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
protein_data_arranged_m2_log_auto_scale_Outliers_removed$subjid <- 
  proteingroup_metadata_m2$record_id

protein_data_arranged_m2_log_auto_scale_Outliers_removed <- 
  protein_data_arranged_m2_log_auto_scale_Outliers_removed %>% 
  dplyr::relocate(subjid,.before = A0A024CIM4)

baseline_time2death <- read_csv("../Processed_data/baseline_time2death.csv")
## Rows: 128 Columns: 127
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (26): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (85): subjid, time, agemonth, ehg, elymph, muac0, eneutrop, eplatelet, ...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## date  (2): dateenrd_T2E, enddate_T2E
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline <- baseline_time2death %>% 
  dplyr::select(subjid, agemonth, sex, site, arm, muac0, case, discdate, 
                dateenrd_T2E, datem2, enddate_T2E)

baseline <- baseline %>% 
  dplyr::mutate(case_recoded = case_when(case == "Case" ~ 1,
                                         case == "Control" ~ 0))

proteomics_HR <- protein_data_arranged_m2_log_auto_scale_Outliers_removed %>% dplyr::left_join(., baseline, join_by(subjid))

proteomics_HR$time2_death <- as.numeric(difftime(proteomics_HR$enddate_T2E, 
                    proteomics_HR$datem2, units = "days"))

#write_csv(proteomics_HR,"../Processed_data/Model_data/proteomics_m2.csv")
###getting gene names
proteingroup <- read.delim("../Raw_data/proteinGroups_LPDM_15112021.txt")
proteomics_file_uniprotID <- proteingroup %>% 
  dplyr::select(c(Gene.names,Protein.IDs))

proteomics_file_uniprotID$Protein.IDs <- gsub(";.*", "", proteomics_file_uniprotID$Protein.IDs)

proteomics_file_uniprotID$Gene.names <- gsub(";.*", "", proteomics_file_uniprotID$Gene.names)

proteomics_file_uniprotID$Gene.names <- na_if(proteomics_file_uniprotID$Gene.names, '')

proteomics_file_uniprotID <- proteomics_file_uniprotID %>% dplyr::mutate(Gene.names=coalesce(proteomics_file_uniprotID$Gene.names,
                                  proteomics_file_uniprotID$Protein.IDs))

df2_split <- proteomics_file_uniprotID

protein_cols <- proteomics_HR %>% 
  dplyr::select(A0A024CIM4:V9HWI6)

protein_data <- proteomics_HR
 
# Initialize lists to store results



adjusted_summaryProt <- list()

adjusted_results <- list()

# protein_data$case_recoded <- as.factor(protein_data$case_recoded)
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth + muac0 +
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_summaryProt[[col]] <- tidy(adjusted_results[[col]])

}
 
adjusted_summary_dfProt <- do.call(rbind,adjusted_summaryProt )

adjusted_summary_dfProt <- adjusted_summary_dfProt %>% 
  rownames_to_column(var = "cytokines") # %>% 
  # slice(-c(1:5)) %>% 
  # dplyr::select(-term)

adjusted_summary_dfProt <- adjusted_summary_dfProt %>% 
  dplyr::filter(term %in% "protein[[col]]") %>% 
  dplyr::select(-term)

adjusted_summary_dfProt$cytokines <- gsub(".1$", "", adjusted_summary_dfProt$cytokines)
 
adjusted_summary_dfProt$estimate <- as.numeric(adjusted_summary_dfProt$estimate)

adjusted_summary_dfProt$std.error <- as.numeric(adjusted_summary_dfProt$std.error)


adjusted_summary_dfProt$p.value <- as.numeric(adjusted_summary_dfProt$p.value)

final_results_adjusted <- adjusted_summary_dfProt %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 


final_results_adjusted$significance <- ifelse(final_results_adjusted$p.value <= 0.05, "Yes", "No")

final_results_adjusted <- final_results_adjusted %>% 
  dplyr::left_join(., df2_split, join_by(cytokines == Protein.IDs))

 final_results_adjusted <- final_results_adjusted %>% 
   dplyr::relocate(Gene.names, .after = cytokines)
 

#write_csv(final_results_adjusted,"../Processed_data/conditionalHR_adjusted_proteinm2.csv")

** MUAC Unadjusted proteins

protein_data_arranged_m2_log_auto_scale_Outliers_removed <- read_csv("../Processed_data/protein_data_arranged_m2_log_auto_scale_Outliers_removed_cleanJN.csv")
## Rows: 98 Columns: 374
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (374): A0A024CIM4, A0A024QZL1, A0A384NYN5, A0A024R035, H0Y612, Q9UFA1, K...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
proteingroup_metadata_m2 <- read_csv("../Processed_data/proteingroup_metadata_m2_clean.csv")
## Rows: 98 Columns: 711
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (23): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, premat...
## dbl  (674): record_id, A0A1Q1PRB1, A0A024CIM4, B7Z1N6, A0A024QZL1, A0A384NYN...
## dttm  (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, d...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
protein_data_arranged_m2_log_auto_scale_Outliers_removed$subjid <- 
  proteingroup_metadata_m2$record_id

protein_data_arranged_m2_log_auto_scale_Outliers_removed <- 
  protein_data_arranged_m2_log_auto_scale_Outliers_removed %>% 
  dplyr::relocate(subjid,.before = A0A024CIM4)

baseline_time2death <- read_csv("../Processed_data/baseline_time2death.csv")
## Rows: 128 Columns: 127
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (26): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (85): subjid, time, agemonth, ehg, elymph, muac0, eneutrop, eplatelet, ...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## date  (2): dateenrd_T2E, enddate_T2E
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline <- baseline_time2death %>% 
  dplyr::select(subjid, agemonth, sex, site, arm, muac0, case, discdate, 
                dateenrd_T2E, datem2, enddate_T2E)

baseline <- baseline %>% 
  dplyr::mutate(case_recoded = case_when(case == "Case" ~ 1,
                                         case == "Control" ~ 0))

proteomics_HR <- protein_data_arranged_m2_log_auto_scale_Outliers_removed %>% dplyr::left_join(., baseline, join_by(subjid))

proteomics_HR$time2_death <- as.numeric(difftime(proteomics_HR$enddate_T2E, 
                    proteomics_HR$datem2, units = "days"))

#write_csv(proteomics_HR,"../Processed_data/Model_data/proteomics_m2.csv")
###getting gene names
proteingroup <- read.delim("../Raw_data/proteinGroups_LPDM_15112021.txt")
proteomics_file_uniprotID <- proteingroup %>% 
  dplyr::select(c(Gene.names,Protein.IDs))

proteomics_file_uniprotID$Protein.IDs <- gsub(";.*", "", proteomics_file_uniprotID$Protein.IDs)

proteomics_file_uniprotID$Gene.names <- gsub(";.*", "", proteomics_file_uniprotID$Gene.names)

proteomics_file_uniprotID$Gene.names <- na_if(proteomics_file_uniprotID$Gene.names, '')

proteomics_file_uniprotID <- proteomics_file_uniprotID %>% dplyr::mutate(Gene.names=coalesce(proteomics_file_uniprotID$Gene.names,
                                  proteomics_file_uniprotID$Protein.IDs))

df2_split <- proteomics_file_uniprotID

protein_cols <- proteomics_HR %>% 
  dplyr::select(A0A024CIM4:V9HWI6)

protein_data <- proteomics_HR
 
# Initialize lists to store results



adjusted_summaryProt <- list()

adjusted_results <- list()

# protein_data$case_recoded <- as.factor(protein_data$case_recoded)
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth +
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_summaryProt[[col]] <- tidy(adjusted_results[[col]])

}
 
adjusted_summary_dfProt <- do.call(rbind,adjusted_summaryProt )

adjusted_summary_dfProt <- adjusted_summary_dfProt %>% 
  rownames_to_column(var = "cytokines") # %>% 
  # slice(-c(1:5)) %>% 
  # dplyr::select(-term)

adjusted_summary_dfProt <- adjusted_summary_dfProt %>% 
  dplyr::filter(term %in% "protein[[col]]") %>% 
  dplyr::select(-term)

adjusted_summary_dfProt$cytokines <- gsub(".1$", "", adjusted_summary_dfProt$cytokines)
 
adjusted_summary_dfProt$estimate <- as.numeric(adjusted_summary_dfProt$estimate)

adjusted_summary_dfProt$std.error <- as.numeric(adjusted_summary_dfProt$std.error)


adjusted_summary_dfProt$p.value <- as.numeric(adjusted_summary_dfProt$p.value)

final_results_adjusted <- adjusted_summary_dfProt %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 


final_results_adjusted$significance <- ifelse(final_results_adjusted$p.value <= 0.05, "Yes", "No")

final_results_adjusted <- final_results_adjusted %>% 
  dplyr::left_join(., df2_split, join_by(cytokines == Protein.IDs))

 final_results_adjusted <- final_results_adjusted %>% 
   dplyr::relocate(Gene.names, .after = cytokines)
 

write_csv(final_results_adjusted,"../Processed_data/UnadjustedMUAC_HR/conditionalHR_proteinm2.csv")

** Hematological factors MUAC adjusted - Month2

baseline_time2death <- read_csv("../Processed_data/baseline_time2death.csv")
## Rows: 128 Columns: 127
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (26): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (85): subjid, time, agemonth, ehg, elymph, muac0, eneutrop, eplatelet, ...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## date  (2): dateenrd_T2E, enddate_T2E
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline_hema <- baseline_time2death %>%
  dplyr::mutate(case_recoded = case_when(case=="Case"~1,case=="Control"~0)) 

hematological_data <- baseline_hema %>% 
  dplyr::select(c(subjid, agemonth, sex, site, arm, case,muac0,hg2, lymph2, neutrop2, platelet2, wbc2, discdate, dateenrd_T2E,datem2, enddate_T2E, case_recoded))

hematological_data$time2_death <- as.numeric(difftime(hematological_data$dateenrd_T2E, hematological_data$datem2, units = "days"))
#%>% na.omit()

#################################################################################

hematological_data_only <- hematological_data %>% 
  dplyr::select(subjid, hg2, lymph2, neutrop2, platelet2, wbc2,) %>% 
  column_to_rownames(var = "subjid")
  
hematological_data_only_log <- log(na.omit(hematological_data_only)) %>% as.matrix()

hematological_data_only_log_scaled <- mdatools::prep.autoscale(hematological_data_only_log,                                                           center = TRUE, scale = TRUE)


hematological_data_only_log_scaled_df <- as.data.frame(hematological_data_only_log_scaled) %>% 
  rownames_to_column(var = "subjid")

hematological_data_HR <- hematological_data %>% 
  dplyr::select(!c(hg2, lymph2, neutrop2, platelet2, wbc2,))


hematological_data_HR$subjid <- as.character(hematological_data_HR$subjid)

hematological_data_HR <-hematological_data_only_log_scaled_df   %>% 
  dplyr::left_join(., hematological_data_HR,
                   join_by(subjid))

hematological_data_HR <- hematological_data_HR %>% 
  dplyr::mutate(Neutr_Lymph_ratio=neutrop2/lymph2)

############################################

hematological_data_HR <- hematological_data_HR %>% 
  dplyr::rename(Haemoglobin = hg2,Lymphocytes = lymph2, 
                Neutrophils = neutrop2, Platelets = platelet2,
                WBCs = wbc2, NLR = Neutr_Lymph_ratio)


protein_cols <- hematological_data_HR %>% 
  dplyr::select(Haemoglobin, Lymphocytes, Neutrophils, Platelets, WBCs, NLR)



protein_data <- hematological_data_HR


adjusted_results <- list()
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth + muac0 +
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_results[[col]] <- tidy(adjusted_results[[col]])

}
## Warning in Surv(m1 + stemp * delta, y[, 1] + stemp * delta, y[, 2]): Stop time
## must be > start time, NA created
## Warning in Surv(m1 + stemp * delta, y[, 1] + stemp * delta, y[, 2]): Stop time
## must be > start time, NA created
## Warning in Surv(m1 + stemp * delta, y[, 1] + stemp * delta, y[, 2]): Stop time
## must be > start time, NA created
## Warning in Surv(m1 + stemp * delta, y[, 1] + stemp * delta, y[, 2]): Stop time
## must be > start time, NA created
## Warning in Surv(m1 + stemp * delta, y[, 1] + stemp * delta, y[, 2]): Stop time
## must be > start time, NA created
## Warning in Surv(m1 + stemp * delta, y[, 1] + stemp * delta, y[, 2]): Stop time
## must be > start time, NA created
adjusted_summary_df <- do.call(rbind,adjusted_results)

adjusted_summary_df <- adjusted_summary_df %>%
  dplyr::filter(term=='protein[[col]]') %>% 
  rownames_to_column(var = "cytokines") %>%
  dplyr::select(-term)

adjusted_summary_df$estimate <- as.numeric(adjusted_summary_df$estimate)

adjusted_summary_df$std.error <- as.numeric(adjusted_summary_df$std.error)


adjusted_summary_df$p.value <- as.numeric(adjusted_summary_df$p.value)

final_results_adjusted <- adjusted_summary_df %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 

final_results_adjusted$significance <- ifelse(final_results_adjusted$p.value <= 0.05, "Yes", "No")


final_results_adjusted$cytokines <- sub(".1$", "", final_results_adjusted$cytokines)

write_csv(final_results_adjusted,"../Processed_data/conditionalHR_adjusted_M2_haematology.csv")

** MUAC unadjusted Hematological factors - Month2

baseline_time2death <- read_csv("../Processed_data/baseline_time2death.csv")
## Rows: 128 Columns: 127
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (26): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (85): subjid, time, agemonth, ehg, elymph, muac0, eneutrop, eplatelet, ...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## date  (2): dateenrd_T2E, enddate_T2E
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline_hema <- baseline_time2death %>%
  dplyr::mutate(case_recoded = case_when(case=="Case"~1,case=="Control"~0)) 

hematological_data <- baseline_hema %>% 
  dplyr::select(c(subjid, agemonth, sex, site, arm, case,muac0,hg2, lymph2, neutrop2, platelet2, wbc2, discdate, dateenrd_T2E,datem2, enddate_T2E, case_recoded))

hematological_data$time2_death <- as.numeric(difftime(hematological_data$dateenrd_T2E, hematological_data$datem2, units = "days"))
#%>% na.omit()

#################################################################################

hematological_data_only <- hematological_data %>% 
  dplyr::select(subjid, hg2, lymph2, neutrop2, platelet2, wbc2,) %>% 
  column_to_rownames(var = "subjid")
  
hematological_data_only_log <- log(na.omit(hematological_data_only)) %>% as.matrix()

hematological_data_only_log_scaled <- mdatools::prep.autoscale(hematological_data_only_log,                                                           center = TRUE, scale = TRUE)


hematological_data_only_log_scaled_df <- as.data.frame(hematological_data_only_log_scaled) %>% 
  rownames_to_column(var = "subjid")

hematological_data_HR <- hematological_data %>% 
  dplyr::select(!c(hg2, lymph2, neutrop2, platelet2, wbc2,))


hematological_data_HR$subjid <- as.character(hematological_data_HR$subjid)

hematological_data_HR <-hematological_data_only_log_scaled_df   %>% 
  dplyr::left_join(., hematological_data_HR,
                   join_by(subjid))

hematological_data_HR <- hematological_data_HR %>% 
  dplyr::mutate(Neutr_Lymph_ratio=neutrop2/lymph2)

############################################

hematological_data_HR <- hematological_data_HR %>% 
  dplyr::rename(Haemoglobin = hg2,Lymphocytes = lymph2, 
                Neutrophils = neutrop2, Platelets = platelet2,
                WBCs = wbc2, NLR = Neutr_Lymph_ratio)


protein_cols <- hematological_data_HR %>% 
  dplyr::select(Haemoglobin, Lymphocytes, Neutrophils, Platelets, WBCs, NLR)



protein_data <- hematological_data_HR


adjusted_results <- list()
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth + 
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_results[[col]] <- tidy(adjusted_results[[col]])

}
## Warning in Surv(m1 + stemp * delta, y[, 1] + stemp * delta, y[, 2]): Stop time
## must be > start time, NA created
## Warning in Surv(m1 + stemp * delta, y[, 1] + stemp * delta, y[, 2]): Stop time
## must be > start time, NA created
## Warning in Surv(m1 + stemp * delta, y[, 1] + stemp * delta, y[, 2]): Stop time
## must be > start time, NA created
## Warning in Surv(m1 + stemp * delta, y[, 1] + stemp * delta, y[, 2]): Stop time
## must be > start time, NA created
## Warning in Surv(m1 + stemp * delta, y[, 1] + stemp * delta, y[, 2]): Stop time
## must be > start time, NA created
## Warning in Surv(m1 + stemp * delta, y[, 1] + stemp * delta, y[, 2]): Stop time
## must be > start time, NA created
adjusted_summary_df <- do.call(rbind,adjusted_results)

adjusted_summary_df <- adjusted_summary_df %>%
  dplyr::filter(term=='protein[[col]]') %>% 
  rownames_to_column(var = "cytokines") %>%
  dplyr::select(-term)

adjusted_summary_df$estimate <- as.numeric(adjusted_summary_df$estimate)

adjusted_summary_df$std.error <- as.numeric(adjusted_summary_df$std.error)


adjusted_summary_df$p.value <- as.numeric(adjusted_summary_df$p.value)

final_results_adjusted <- adjusted_summary_df %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 

final_results_adjusted$significance <- ifelse(final_results_adjusted$p.value <= 0.05, "Yes", "No")


final_results_adjusted$cytokines <- sub(".1$", "", final_results_adjusted$cytokines)

write_csv(final_results_adjusted,"../Processed_data/UnadjustedMUAC_HR/conditionalHR_M2_haematology.csv")

Survival analysis at Enrolment

set.seed(10)
proteomics_file <- read_csv("../Raw_data/LPDM_Protomics_24092024.csv")
## New names:
## Rows: 191 Columns: 613
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): timepoint, group dbl (611): ...1, record_id, A0A1Q1PRB1, A0A024CIM4,
## B7Z1N6, A0A024QZL1, A0A3...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
proteomics_file_E <- proteomics_file %>% dplyr::filter(timepoint=="E")

##Case (60) Control (32)
proteomics_file_E <- proteomics_file_E %>% dplyr::select(-c(timepoint,group,...1))
  
proteomics_file_E <- proteomics_file_E %>% 
  dplyr::select(!contains(c("REV__","CON__")))
proteingroup <- read.delim("../Raw_data/proteinGroups_LPDM_15112021.txt")

protein_group_genenames <- proteingroup %>% dplyr::select(Protein.IDs,Gene.names)

protein_group_genenames$Protein.IDs <- gsub(";.*", "", protein_group_genenames$Protein.IDs)

protein_group_genenames$Gene.names <- gsub(";.*", "", protein_group_genenames$Gene.names)

protein_group_genenames$Gene.names <- na_if(protein_group_genenames$Gene.names, '')

protein_group_genenames <- protein_group_genenames %>% dplyr::mutate(Gene.names=coalesce(protein_group_genenames$Gene.names,
                                  protein_group_genenames$Protein.IDs))
df2_split <- protein_group_genenames

#proteomics_file_m2 <- proteomics_file_m2 %>% column_to_rownames(var="record_id")

proteomics_file_uniprotID <- data.frame(Protein.IDs=colnames(proteomics_file_E)) %>%
  dplyr::filter(Protein.IDs!="record_id")
                                        

proteomics_file_uniprotID <- proteomics_file_uniprotID %>% left_join(.,protein_group_genenames,join_by(Protein.IDs))


#colnames(proteomics_file_m2) <- proteomics_file_uniprotID$Gene.names

#proteomics_file_m2 <- proteomics_file_m2 %>% rownames_to_column(var="record_id")

metadata <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")


proteingroup_metadata <- proteomics_file_E %>% 
  dplyr::left_join(.,metadata, join_by(record_id==subjid))


## Subset month Enrolment data
proteingroup_metadata_E <- proteingroup_metadata

#write_csv(proteingroup_metadata_E,"proteingroup_metadata_E_cleanJN.csv")

 ## Filtering 80% missing values
 
 ##Filtering missing values. Proteins missing in 80% of the samples were filtered out. 
 
#proteingroup_metadata_E <- read_csv("proteingroup_metadata_E_cleanJN.csv")

## Filtering missing data with EdgeR
proteingroup_metadata_E <- proteingroup_metadata_E %>% 
  dplyr::slice(-49)

proteingroup_metadata_E <- proteingroup_metadata_E %>%
  column_to_rownames(var = "record_id")

proteingroup_metadata_E_intensities <- proteingroup_metadata_E %>% 
  dplyr::select(A0A1Q1PRB1:V9HWI6)


edgeR_object <- edgeR::DGEList(counts = as.matrix(t(proteingroup_metadata_E_intensities)),samples = proteingroup_metadata_E,group = proteingroup_metadata_E$case,remove.zeros = T)
## Removing 10 rows with all zero counts
keep <- filterByExpr(edgeR_object,min.count = 0.1, 
                   min.total.count = 0.1, 
                   large.n = 10, min.prop = 0.8)
edgeR_object<-edgeR_object[keep,]


filtered_proteomics_data <- as.data.frame(t(edgeR_object[["counts"]]))


filtered_sample_data <- as.data.frame(edgeR_object[["samples"]])

# filtered_sample_data <- filtered_sample_data %>% 
#   dplyr::slice(-49)
# 
# filtered_proteomics_data <- filtered_proteomics_data %>% 
#   dplyr::slice(-49)

#rownames(filtered_proteomics_data) <- filtered_sample_data$record_id

## Imputation

 ### Replacing 0 with NA

filtered_proteomics_data[filtered_proteomics_data==0] <- NA


## Imputation using KNN

imp_knn = impute.knn(as.matrix(filtered_proteomics_data,
                     k = 10, rowmax = 0.5, colmax = 0.8,
                     maxp = 1500, rng.seed=362436069))
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 6 rows with more than 50 % entries missing;
##  mean imputation used for these rows
### Accessing the data
filtered_proteomics_data_knn <- as.data.frame(imp_knn$data) 


## Normalization and replacing outliers
 ### Log normalization
protein_data_arranged_E_log <- log(na.omit(filtered_proteomics_data_knn))

protein_data_arranged_E_log_matrix <- as.matrix(protein_data_arranged_E_log)

protein_data_arranged_E_log_auto_scale <- mdatools::prep.autoscale(protein_data_arranged_E_log_matrix,                                                           center = TRUE, scale = TRUE)


protein_data_arranged_E_log_auto_scale <- as.data.frame(protein_data_arranged_E_log_auto_scale)

 ## Replacing outliers 

capOutlier <- function(x){

  qnt <- quantile(x, probs=c(.25, .75), na.rm = T)

  caps <- quantile(x, probs=c(.1, .90), na.rm = T)

  H <- 1.5 * IQR(x, na.rm = T)

  x[x < (qnt[1] - H)] <- caps[1]

  x[x > (qnt[2] + H)] <- caps[2]

  return(x)

}

protein_data_arranged_E_log_auto_scale_Outliers_removed <- protein_data_arranged_E_log_auto_scale %>% mutate_at(c(1:423), capOutlier)

protein_data_arranged_E_log_auto_scale_Outliers_removed <- 
  protein_data_arranged_E_log_auto_scale_Outliers_removed %>% 
  rownames_to_column(var = "subjid")

write_csv(protein_data_arranged_E_log_auto_scale_Outliers_removed,"../Processed_data/protein_data_arranged_E_log_auto_scale_Outliers_removed_clean.csv")

** Proteomics MUAC adjusted

proteomics_enrol <- read_csv("../Processed_data/protein_data_arranged_E_log_auto_scale_Outliers_removed_clean.csv")
## Rows: 91 Columns: 424
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (424): subjid, A0A1Q1PRB1, A0A024CIM4, A0A024QZL1, A0A384NYN5, A0A024R03...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline_time2death <- read_csv("../Processed_data/baseline_time2death.csv")
## Rows: 128 Columns: 127
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (26): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (85): subjid, time, agemonth, ehg, elymph, muac0, eneutrop, eplatelet, ...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## date  (2): dateenrd_T2E, enddate_T2E
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline <- baseline_time2death %>% 
  dplyr::select(subjid, agemonth, sex, site, arm, case, muac0, discdate, 
                dateenrd_T2E, datem2, enddate_T2E)


baseline <- baseline %>% 
  dplyr::mutate(case_recoded = case_when(case == "Case" ~ 1,
                                         case == "Control" ~ 0))

proteomics_HR <- proteomics_enrol %>% dplyr::left_join(., baseline, join_by(subjid))

proteomics_HR$time2_death <- as.numeric(difftime(proteomics_HR$datem2, 
                    proteomics_HR$discdate, units = "days"))

#write_csv(proteomics_HR,"../Processed_data/Model_data/proteomics_enrol.csv")

protein_cols <- proteomics_HR %>% 
  dplyr::select(A0A1Q1PRB1:V9HWI6)

protein_data <- proteomics_HR
 

proteingroup <- read.delim("../Raw_data/proteinGroups_LPDM_15112021.txt")

protein_group_genenames <- proteingroup %>% dplyr::select(Protein.IDs,Gene.names)

protein_group_genenames$Protein.IDs <- gsub(";.*", "", protein_group_genenames$Protein.IDs)

protein_group_genenames$Gene.names <- gsub(";.*", "", protein_group_genenames$Gene.names)

protein_group_genenames$Gene.names <- na_if(protein_group_genenames$Gene.names, '')

protein_group_genenames <- protein_group_genenames %>% dplyr::mutate(Gene.names=coalesce(protein_group_genenames$Gene.names,
                                  protein_group_genenames$Protein.IDs))
df2_split <- protein_group_genenames

# Initialize lists to store results


adjusted_summaryProt <- list()

adjusted_results <- list()

# protein_data$case_recoded <- as.factor(protein_data$case_recoded)
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth + muac0 +
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_summaryProt[[col]] <- tidy(adjusted_results[[col]])

}
 
adjusted_summary_dfProt <- do.call(rbind,adjusted_summaryProt )

adjusted_summary_dfProt <- adjusted_summary_dfProt %>% 
  rownames_to_column(var = "cytokines") # %>% 
  # slice(-c(1:5)) %>% 
  # dplyr::select(-term)

adjusted_summary_dfProt <- adjusted_summary_dfProt %>% 
  dplyr::filter(term %in% "protein[[col]]") %>% 
  dplyr::select(-term)

adjusted_summary_dfProt$cytokines <- gsub(".1$", "", adjusted_summary_dfProt$cytokines)
 
adjusted_summary_dfProt$estimate <- as.numeric(adjusted_summary_dfProt$estimate)

adjusted_summary_dfProt$std.error <- as.numeric(adjusted_summary_dfProt$std.error)


adjusted_summary_dfProt$p.value <- as.numeric(adjusted_summary_dfProt$p.value)

final_results_adjusted <- adjusted_summary_dfProt %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 


final_results_adjusted$significance <- ifelse(final_results_adjusted$p.value <= 0.05, "Yes", "No")
 
final_results_adjusted <- final_results_adjusted %>% 
  dplyr::left_join(., df2_split, join_by(cytokines == Protein.IDs))

final_results_adjusted <- final_results_adjusted %>% 
  dplyr::relocate(Gene.names, .after = cytokines)

write_csv(final_results_adjusted,"../Processed_data/conditionalHR_adjusted_proteins_enrol.csv")

** MUAC unadjusted Proteomics

proteomics_enrol <- read_csv("../Processed_data/protein_data_arranged_E_log_auto_scale_Outliers_removed_clean.csv")
## Rows: 91 Columns: 424
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (424): subjid, A0A1Q1PRB1, A0A024CIM4, A0A024QZL1, A0A384NYN5, A0A024R03...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline_time2death <- read_csv("../Processed_data/baseline_time2death.csv")
## Rows: 128 Columns: 127
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (26): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (85): subjid, time, agemonth, ehg, elymph, muac0, eneutrop, eplatelet, ...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## date  (2): dateenrd_T2E, enddate_T2E
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline <- baseline_time2death %>% 
  dplyr::select(subjid, agemonth, sex, site, arm, case, muac0, discdate, 
                dateenrd_T2E, datem2, enddate_T2E)


baseline <- baseline %>% 
  dplyr::mutate(case_recoded = case_when(case == "Case" ~ 1,
                                         case == "Control" ~ 0))

proteomics_HR <- proteomics_enrol %>% dplyr::left_join(., baseline, join_by(subjid))

proteomics_HR$time2_death <- as.numeric(difftime(proteomics_HR$datem2, 
                    proteomics_HR$discdate, units = "days"))

#write_csv(proteomics_HR,"../Processed_data/Model_data/proteomics_enrol.csv")

protein_cols <- proteomics_HR %>% 
  dplyr::select(A0A1Q1PRB1:V9HWI6)

protein_data <- proteomics_HR
 

proteingroup <- read.delim("../Raw_data/proteinGroups_LPDM_15112021.txt")

protein_group_genenames <- proteingroup %>% dplyr::select(Protein.IDs,Gene.names)

protein_group_genenames$Protein.IDs <- gsub(";.*", "", protein_group_genenames$Protein.IDs)

protein_group_genenames$Gene.names <- gsub(";.*", "", protein_group_genenames$Gene.names)

protein_group_genenames$Gene.names <- na_if(protein_group_genenames$Gene.names, '')

protein_group_genenames <- protein_group_genenames %>% dplyr::mutate(Gene.names=coalesce(protein_group_genenames$Gene.names,
                                  protein_group_genenames$Protein.IDs))
df2_split <- protein_group_genenames

# Initialize lists to store results


adjusted_summaryProt <- list()

adjusted_results <- list()

# protein_data$case_recoded <- as.factor(protein_data$case_recoded)
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth +
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_summaryProt[[col]] <- tidy(adjusted_results[[col]])

}
 
adjusted_summary_dfProt <- do.call(rbind,adjusted_summaryProt )

adjusted_summary_dfProt <- adjusted_summary_dfProt %>% 
  rownames_to_column(var = "cytokines") # %>% 
  # slice(-c(1:5)) %>% 
  # dplyr::select(-term)

adjusted_summary_dfProt <- adjusted_summary_dfProt %>% 
  dplyr::filter(term %in% "protein[[col]]") %>% 
  dplyr::select(-term)

adjusted_summary_dfProt$cytokines <- gsub(".1$", "", adjusted_summary_dfProt$cytokines)
 
adjusted_summary_dfProt$estimate <- as.numeric(adjusted_summary_dfProt$estimate)

adjusted_summary_dfProt$std.error <- as.numeric(adjusted_summary_dfProt$std.error)


adjusted_summary_dfProt$p.value <- as.numeric(adjusted_summary_dfProt$p.value)

final_results_adjusted <- adjusted_summary_dfProt %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 


final_results_adjusted$significance <- ifelse(final_results_adjusted$p.value <= 0.05, "Yes", "No")
 
final_results_adjusted <- final_results_adjusted %>% 
  dplyr::left_join(., df2_split, join_by(cytokines == Protein.IDs))

final_results_adjusted <- final_results_adjusted %>% 
  dplyr::relocate(Gene.names, .after = cytokines)

write_csv(final_results_adjusted,"../Processed_data/UnadjustedMUAC_HR/conditionalHR_proteins_enrolment.csv")

** 4plex MUAC adjusted

allplates_4plex <- read_csv("/Users/bkamau/Desktop/Preprocessed/clean_files/combinedplates_4plex.csv")
## New names:
## Rows: 208 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): subjid, timepoint, Thrombomodulin/BDCA-3, Angiopoietin-2, D-dimer,... dbl
## (1): ...1 date (1): date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
allplates_4plex_Enr <- allplates_4plex %>% 
  dplyr::filter(timepoint=="Enr")

allplates_4plex_Enr <- allplates_4plex_Enr %>% 
  dplyr::select(-c(date,timepoint,plate,...1))

df <- allplates_4plex_Enr
# Sample data frame with 20 columns and 100 rows
set.seed(123)  # For reproducibility

# Loop through each column in the data frame
for (col in names(df)) {
  # Convert column to character to handle "<" and ">" values properly
  df[[col]] <- as.character(df[[col]])
  
  # Calculate the minimum and maximum numeric values in the column, ignoring "<" and ">" values
  min_value <- min(as.numeric(df[[col]][!grepl("^[<>]", df[[col]])]), na.rm = TRUE)
  max_value <- max(as.numeric(df[[col]][!grepl("^[<>]", df[[col]])]), na.rm = TRUE)
  
  # Calculate the replacement value for "<" as minimum value divided by the square root of 2
  replacement_min <- min_value / sqrt(2)
  
  # Replace values that start with '<' with the calculated replacement minimum value
  df[[col]] <- ifelse(grepl("^<", df[[col]]), replacement_min, df[[col]])
  
  # Replace values that start with '>' with the maximum value in the column
  df[[col]] <- ifelse(grepl("^>", df[[col]]), max_value, df[[col]])
}
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
# Convert columns back to their appropriate data types (e.g., numeric)
df[] <- lapply(df, function(x) as.numeric(as.character(x)))
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
allplates_4plex_Enr <- df


# Assuming your data frame is named 'df' and the participant ID column is named 'ParticipantID'
allplates_4plex_Enr$subjid <- as.character(allplates_4plex_Enr$subjid)

allplates_4plex_Enr_clean <- allplates_4plex_Enr %>% slice(-c(146))
# Process the data
allplates_4plex_Enr_clean <- allplates_4plex_Enr_clean %>%
  group_by(subjid) %>% # Group by the participant identifier
  slice_max(rowSums(across(where(is.numeric))), n = 1, with_ties = FALSE) %>% # Select the row with the highest sum of protein values
  ungroup() # Ungroup the data frame
 
# View the result
print(allplates_4plex_Enr_clean)
## # A tibble: 114 × 5
##    subjid `Thrombomodulin/BDCA-3` `Angiopoietin-2` `D-dimer` ADAMTS13
##    <chr>                    <dbl>            <dbl>     <dbl>    <dbl>
##  1 103                       4264             8021       529      497
##  2 1041                      2536             1387      1331      203
##  3 1109                      3071             2222       480      217
##  4 1143                      2908             3848      3164      845
##  5 1164                      4808             9059      2979      748
##  6 1185                      3356             1525     10258      691
##  7 1212                      1109             1994      5125     1783
##  8 1214                      3159             3329     10258      229
##  9 1270                      1483             1554      5588      198
## 10 1281                      4848             2783      1715      724
## # ℹ 104 more rows
#write_csv(allplates_4plex_Enr_clean,"CTX_LPDM_251024/rawdata_4plex_enrol.csv")


## Log normalization and Autoscaling

allplates_4plex_Enr_clean <- allplates_4plex_Enr_clean %>%
   column_to_rownames(var = "subjid")

allplates_4plex_Enr_clean_log <- log(na.omit(allplates_4plex_Enr_clean))

allplates_4plex_Enr_clean_log_auto_scale <- mdatools::prep.autoscale(as.matrix(allplates_4plex_Enr_clean_log), 
center = TRUE, scale = TRUE)


allplates_4plex_Enr_clean_log_auto_scale <- as.data.frame(allplates_4plex_Enr_clean_log_auto_scale)


### Outlier replacement

capOutlier <- function(x){

  qnt <- quantile(x, probs=c(.25, .75), na.rm = T)

  caps <- quantile(x, probs=c(.1, .90), na.rm = T)

  H <- 1.5 * IQR(x, na.rm = T)

  x[x < (qnt[1] - H)] <- caps[1]

  x[x > (qnt[2] + H)] <- caps[2]

  return(x)

}

allplates_4plex_Enr_clean_log_auto_scale_long_Outliers_removed <- allplates_4plex_Enr_clean_log_auto_scale %>% mutate_at(c(1:4), capOutlier)


## Combine with baseline data

metadata <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")

allplates_4plex_Enr_clean_log_auto_scale <- allplates_4plex_Enr_clean_log_auto_scale %>% 
   rownames_to_column(var = "subjid")

metadata$subjid <- as.character(metadata$subjid)

cytokines4plex_metadata <- allplates_4plex_Enr_clean_log_auto_scale %>%
  dplyr::left_join(.,metadata, join_by(subjid))


#case <- luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removed$case
cytokines4plex_metadata <- cytokines4plex_metadata %>% 
  dplyr::filter(!is.na(case))

cytokines4plex_metadata <- cytokines4plex_metadata %>% 
  dplyr::rename(Thrombomodulin=`Thrombomodulin/BDCA-3`,
                Angiopoietin=`Angiopoietin-2`)

write_csv(cytokines4plex_metadata,"../Processed_data/luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removedEnrolment.csv")

** 4plex MUAC adjusted

cytokines4plex_metadata <- read_csv("../Processed_data/luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removedEnrolment.csv")
## Rows: 112 Columns: 126
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (23): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (89): subjid, Thrombomodulin, Angiopoietin, D-dimer, ADAMTS13, time, ag...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cytokines4plex_metadata <- cytokines4plex_metadata %>% 
  dplyr::select(1:5)

baseline_time2death <- read_csv("../Processed_data/baseline_time2death.csv")
## Rows: 128 Columns: 127
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (26): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (85): subjid, time, agemonth, ehg, elymph, muac0, eneutrop, eplatelet, ...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## date  (2): dateenrd_T2E, enddate_T2E
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline_time2death <- baseline_time2death %>% 
  dplyr::select(subjid, agemonth, sex, site, arm, case, muac0, discdate, 
                dateenrd_T2E, datem2, enddate_T2E)

baseline_time2death <- baseline_time2death %>% 
  dplyr::mutate(case_recoded = case_when(case == "Case" ~ 1,
                                         case == "Control" ~ 0))

baseline_time2death$subjid <- as.character(baseline_time2death$subjid)

cytokines4plex_metadata$subjid <- as.character(cytokines4plex_metadata$subjid)

cytokines4plex_metadata_HR <- cytokines4plex_metadata %>% dplyr::left_join(., baseline_time2death, join_by(subjid))

cytokines4plex_metadata_HR$time2_death <- as.numeric(difftime(cytokines4plex_metadata_HR$datem2, cytokines4plex_metadata_HR$discdate, units = "days"))


protein_cols <- cytokines4plex_metadata_HR %>% 
  dplyr::select(Thrombomodulin:ADAMTS13)

protein_data <- cytokines4plex_metadata_HR
 
# Initialize lists to store results

#crude_results <- list()

adjusted_results <- list()
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth + muac0 +
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_results[[col]] <- tidy(adjusted_results[[col]])

}
 
adjusted_summary_df <- do.call(rbind,adjusted_results)

crude_summary_df <- adjusted_summary_df %>%
  dplyr::filter(term=='protein[[col]]') %>% 
  rownames_to_column(var = "cytokines") %>%
  dplyr::select(-term)

crude_summary_df$estimate <- as.numeric(crude_summary_df$estimate)

crude_summary_df$std.error <- as.numeric(crude_summary_df$std.error)


crude_summary_df$p.value <- as.numeric(crude_summary_df$p.value)

final_results_crude <- crude_summary_df %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 

final_results_crude$significance <- ifelse(final_results_crude$p.value <= 0.05, "Yes", "No")
 final_results_crude$cytokines <- sub(".1$", "", final_results_crude$cytokines)

write_csv(final_results_crude,"../Processed_data/conditionalHR_adjusted_enrol_4plex.csv")

** MUAC unadjusted - 4plex

cytokines4plex_metadata <- read_csv("../Processed_data/luminex_4plex_cleaned_numeric_log_auto_scale_long_Outliers_removedEnrolment.csv")
## Rows: 112 Columns: 126
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (23): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (89): subjid, Thrombomodulin, Angiopoietin, D-dimer, ADAMTS13, time, ag...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cytokines4plex_metadata <- cytokines4plex_metadata %>% 
  dplyr::select(1:5)

baseline_time2death <- read_csv("../Processed_data/baseline_time2death.csv")
## Rows: 128 Columns: 127
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (26): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (85): subjid, time, agemonth, ehg, elymph, muac0, eneutrop, eplatelet, ...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## date  (2): dateenrd_T2E, enddate_T2E
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline_time2death <- baseline_time2death %>% 
  dplyr::select(subjid, agemonth, sex, site, arm, case, muac0, discdate, 
                dateenrd_T2E, datem2, enddate_T2E)

baseline_time2death <- baseline_time2death %>% 
  dplyr::mutate(case_recoded = case_when(case == "Case" ~ 1,
                                         case == "Control" ~ 0))

baseline_time2death$subjid <- as.character(baseline_time2death$subjid)

cytokines4plex_metadata$subjid <- as.character(cytokines4plex_metadata$subjid)

cytokines4plex_metadata_HR <- cytokines4plex_metadata %>% dplyr::left_join(., baseline_time2death, join_by(subjid))

cytokines4plex_metadata_HR$time2_death <- as.numeric(difftime(cytokines4plex_metadata_HR$datem2, cytokines4plex_metadata_HR$discdate, units = "days"))


protein_cols <- cytokines4plex_metadata_HR %>% 
  dplyr::select(Thrombomodulin:ADAMTS13)

protein_data <- cytokines4plex_metadata_HR
 
# Initialize lists to store results

#crude_results <- list()

adjusted_results <- list()
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth +
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_results[[col]] <- tidy(adjusted_results[[col]])

}
 
adjusted_summary_df <- do.call(rbind,adjusted_results)

crude_summary_df <- adjusted_summary_df %>%
  dplyr::filter(term=='protein[[col]]') %>% 
  rownames_to_column(var = "cytokines") %>%
  dplyr::select(-term)

crude_summary_df$estimate <- as.numeric(crude_summary_df$estimate)

crude_summary_df$std.error <- as.numeric(crude_summary_df$std.error)


crude_summary_df$p.value <- as.numeric(crude_summary_df$p.value)

final_results_crude <- crude_summary_df %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 

final_results_crude$significance <- ifelse(final_results_crude$p.value <= 0.05, "Yes", "No")
 final_results_crude$cytokines <- sub(".1$", "", final_results_crude$cytokines)

write_csv(final_results_crude,"../Processed_data/UnadjustedMUAC_HR/conditionalHR_enrolment_4plex.csv")

** 29plex Enrolment

allplates_29plex <- read_csv("/Users/bkamau/Desktop/Preprocessed/clean_files/combinedplates_29plex.csv")
## New names:
## Rows: 165 Columns: 35
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (25): subjid, timepoint, IFNg, IL-12p40, MIP-1a, IL-12p70, G-CSF, VEGF,... dbl
## (8): ...1, IL10, MIP-1b, Eotaxin, TNFa, IL-8, MCP-1, IP-10 lgl (1): RANTES date
## (1): date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
allplates_29plex_Enr <- allplates_29plex %>% 
  dplyr::filter(timepoint=="Enr")

allplates_29plex_Enr <- allplates_29plex_Enr %>% 
  dplyr::select(-c(date,timepoint,plate,...1))

df <- allplates_29plex_Enr
# Sample data frame with 20 columns and 100 rows
set.seed(123)  # For reproducibility

# Loop through each column in the data frame
for (col in names(df)) {
  # Convert column to character to handle "<" and ">" values properly
  df[[col]] <- as.character(df[[col]])
  
  # Calculate the minimum and maximum numeric values in the column, ignoring "<" and ">" values
  min_value <- min(as.numeric(df[[col]][!grepl("^[<>]", df[[col]])]), na.rm = TRUE)
  max_value <- max(as.numeric(df[[col]][!grepl("^[<>]", df[[col]])]), na.rm = TRUE)
  
  # Calculate the replacement value for "<" as minimum value divided by the square root of 2
  replacement_min <- min_value / sqrt(2)
  
  # Replace values that start with '<' with the calculated replacement minimum value
  df[[col]] <- ifelse(grepl("^<", df[[col]]), replacement_min, df[[col]])
  
  # Replace values that start with '>' with the maximum value in the column
  df[[col]] <- ifelse(grepl("^>", df[[col]]), max_value, df[[col]])
}
## Warning in min(as.numeric(df[[col]][!grepl("^[<>]", df[[col]])]), na.rm =
## TRUE): no non-missing arguments to min; returning Inf
## Warning in max(as.numeric(df[[col]][!grepl("^[<>]", df[[col]])]), na.rm =
## TRUE): no non-missing arguments to max; returning -Inf
## Warning in min(as.numeric(df[[col]][!grepl("^[<>]", df[[col]])]), na.rm =
## TRUE): no non-missing arguments to min; returning Inf
## Warning in max(as.numeric(df[[col]][!grepl("^[<>]", df[[col]])]), na.rm =
## TRUE): no non-missing arguments to max; returning -Inf
# Convert columns back to their appropriate data types (e.g., numeric)
df[] <- lapply(df, function(x) as.numeric(as.character(x)))


allplates_29plex_Enr <- df


# Assuming your data frame is named 'df' and the participant ID column is named 'ParticipantID'
allplates_29plex_Enr$subjid <- as.character(allplates_29plex_Enr$subjid)

allplates_29plex_Enr_clean <- allplates_29plex_Enr %>% 
  dplyr::select(-c(RANTES))
# Process the data
allplates_29plex_Enr_clean <- allplates_29plex_Enr_clean %>%
  group_by(subjid) %>% # Group by the participant identifier
  slice_max(rowSums(across(where(is.numeric))), n = 1, with_ties = FALSE) %>% # Select the row with the highest sum of protein values
  ungroup() # Ungroup the data frame
 
# View the result
print(allplates_29plex_Enr_clean)
## # A tibble: 112 × 30
##    subjid  IL10  IFNg `IL-12p40` `MIP-1a` `IL-12p70` `MIP-1b` `G-CSF`  VEGF
##    <chr>  <dbl> <dbl>      <dbl>    <dbl>      <dbl>    <dbl>   <dbl> <dbl>
##  1 103    73.3   8.85      0.81      7.04       2.06     66.4   74.5  228  
##  2 1041    8.13 73.4       2.83      1.60       1.41     10.6   38.9   15.9
##  3 1109   12.2  14.9      26.8      33.1        4.69     47.5   89.4   58.2
##  4 1143    7.15  6.31     14.7      15.7        5.39     39.0   37.2  428  
##  5 1185   12.5   6.77      0.573    14.1        2.44    138     19.2   69.9
##  6 1212    9.58  3.96    102         9.7        7.24     79.0   42.4  167  
##  7 1214   16.0   7.37     10.7      30.4        5.17     94.5   55.0  110  
##  8 1270   13.8   2.59      0.573     1.60       1.41     88.7    3.46  34.6
##  9 1281   11.6   8.03     26.5      22.0        5.17     54.8   32.0   89.1
## 10 130     9.27  2.37      0.573     1.60       1.41     94.5   48.6   69.2
## # ℹ 102 more rows
## # ℹ 21 more variables: Eotaxin <dbl>, TNFb <dbl>, TNFa <dbl>, IFNa2 <dbl>,
## #   `GM-CSF` <dbl>, `IL-5` <dbl>, `IL-7` <dbl>, `IL-1RA` <dbl>, `IL-8` <dbl>,
## #   `IL-2` <dbl>, `IL-4` <dbl>, `IL-6` <dbl>, `IL-3` <dbl>, `MCP-1` <dbl>,
## #   `IL-15` <dbl>, `IL-13` <dbl>, `IL-17` <dbl>, `IL-1b` <dbl>, `IL-1a` <dbl>,
## #   `IP-10` <dbl>, EGF <dbl>
write_csv(allplates_29plex_Enr_clean,"../Raw_data/rawdata_29plex_enrol.csv")

 allplates_29plex_Enr_clean <- allplates_29plex_Enr_clean %>%
    column_to_rownames(var = "subjid")

allplates_29plex_Enr_clean_log <- log(na.omit(allplates_29plex_Enr_clean))

allplates_29plex_Enr_clean_log_auto_scale <- mdatools::prep.autoscale(as.matrix(allplates_29plex_Enr_clean_log), 
center = TRUE, scale = TRUE)


allplates_29plex_Enr_clean_log_auto_scale <- as.data.frame(allplates_29plex_Enr_clean_log_auto_scale)


### Outlier replacement

capOutlier <- function(x){

  qnt <- quantile(x, probs=c(.25, .75), na.rm = T)

  caps <- quantile(x, probs=c(.1, .90), na.rm = T)

  H <- 1.5 * IQR(x, na.rm = T)

  x[x < (qnt[1] - H)] <- caps[1]

  x[x > (qnt[2] + H)] <- caps[2]

  return(x)

}

allplates_29plex_Enr_clean_log_auto_scale_long_Outliers_removed <- allplates_29plex_Enr_clean_log_auto_scale %>% mutate_at(c(1:29), capOutlier)


## Combine with baseline data

metadata <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")

allplates_29plex_Enr_clean_log_auto_scale <- allplates_29plex_Enr_clean_log_auto_scale %>% 
   rownames_to_column(var = "subjid")

metadata$subjid <- as.character(metadata$subjid)

cytokines29plex_metadata <- allplates_29plex_Enr_clean_log_auto_scale %>%
  dplyr::left_join(.,metadata, join_by(subjid))


#case <- luminex_29plex_cleaned_numeric_log_auto_scale_long_Outliers_removed$case
cytokines29plex_metadata <- cytokines29plex_metadata %>% 
  dplyr::filter(!is.na(case))

cytokines29plex_metadata <- cytokines29plex_metadata %>% 
  dplyr::select(-c(`IL-3`))

write_csv(cytokines29plex_metadata,"../Processed_data/luminex_29plex_cleaned_numeric_log_auto_scale_long_Outliers_removedEnrolment.csv")

** 29plex Enrolment MUAC adjusted

cytokines29plex_metadata <- read_csv("../Processed_data/luminex_29plex_cleaned_numeric_log_auto_scale_long_Outliers_removedEnrolment.csv")
## Rows: 110 Columns: 150
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (23): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, premat...
## dbl  (113): subjid, IL10, IFNg, IL-12p40, MIP-1a, IL-12p70, MIP-1b, G-CSF, V...
## dttm  (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, d...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline_time2death <- read_csv("../Processed_data/baseline_time2death.csv")
## Rows: 128 Columns: 127
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (26): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (85): subjid, time, agemonth, ehg, elymph, muac0, eneutrop, eplatelet, ...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## date  (2): dateenrd_T2E, enddate_T2E
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cytokines29plex_metadata <- cytokines29plex_metadata %>% 
  dplyr::select(1:29)


baseline_time2death <- baseline_time2death %>% 
  dplyr::select(subjid, agemonth, sex, site, arm, case, muac0, discdate, 
                dateenrd_T2E, datem2, enddate_T2E)

baseline_time2death <- baseline_time2death %>% 
  dplyr::mutate(case_recoded = case_when(case == "Case" ~ 1,
                                         case == "Control" ~ 0))

baseline_time2death$subjid <- as.character(baseline_time2death$subjid)

cytokines29plex_metadata$subjid <- as.character(cytokines29plex_metadata$subjid)

cytokines29plex_metadata_HR <- cytokines29plex_metadata %>% dplyr::left_join(., baseline_time2death, join_by(subjid))

cytokines29plex_metadata_HR$time2_death <- as.numeric(difftime(cytokines29plex_metadata_HR$datem2, cytokines29plex_metadata_HR$discdate, units = "days"))

cytokines29plex_metadata_HR
## # A tibble: 110 × 41
##    subjid    IL10    IFNg `IL-12p40` `MIP-1a` `IL-12p70` `MIP-1b` `G-CSF`
##    <chr>    <dbl>   <dbl>      <dbl>    <dbl>      <dbl>    <dbl>   <dbl>
##  1 103     2.06   -0.0595    -0.615   -0.138      -0.409  -0.162   0.590 
##  2 1041   -0.741   2.15       0.0964  -1.00       -0.928  -2.83   -0.0422
##  3 1109   -0.229   0.486      1.38     0.763       0.711  -0.648   0.767 
##  4 1143   -0.904  -0.412      1.04     0.328       0.900  -0.935  -0.0847
##  5 1185   -0.193  -0.339     -0.813    0.268      -0.179   0.903  -0.730 
##  6 1212   -0.532  -0.898      2.14     0.0490      1.30    0.0925  0.0409
##  7 1214    0.122  -0.250      0.855    0.715       0.844   0.353   0.295 
##  8 1270   -0.0710 -1.34      -0.813   -1.00       -0.928   0.261  -2.40  
##  9 1281   -0.283  -0.161      1.37     0.527       0.844  -0.440  -0.230 
## 10 130    -0.574  -1.43      -0.813   -1.00       -0.928   0.352   0.176 
## # ℹ 100 more rows
## # ℹ 33 more variables: VEGF <dbl>, Eotaxin <dbl>, TNFb <dbl>, TNFa <dbl>,
## #   IFNa2 <dbl>, `GM-CSF` <dbl>, `IL-5` <dbl>, `IL-7` <dbl>, `IL-1RA` <dbl>,
## #   `IL-8` <dbl>, `IL-2` <dbl>, `IL-4` <dbl>, `IL-6` <dbl>, `MCP-1` <dbl>,
## #   `IL-15` <dbl>, `IL-13` <dbl>, `IL-17` <dbl>, `IL-1b` <dbl>, `IL-1a` <dbl>,
## #   `IP-10` <dbl>, EGF <dbl>, agemonth <dbl>, sex <chr>, site <chr>, arm <chr>,
## #   case <chr>, muac0 <dbl>, discdate <dttm>, dateenrd_T2E <date>, …
write_csv(cytokines29plex_metadata_HR,"../Processed_data/Model_data/4plex_enrol.csv")

protein_cols <- cytokines29plex_metadata_HR %>% 
  dplyr::select(IL10:EGF)

protein_data <- cytokines29plex_metadata_HR
 
# Initialize lists to store results

adjusted_results <- list()
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth + muac0 +
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_results[[col]] <- tidy(adjusted_results[[col]])

}
 
adjusted_summary_df <- do.call(rbind,adjusted_results)

crude_summary_df <- adjusted_summary_df %>%
  dplyr::filter(term=='protein[[col]]') %>% 
  rownames_to_column(var = "cytokines") %>% 
  dplyr::select(-term)

crude_summary_df$estimate <- as.numeric(crude_summary_df$estimate)

crude_summary_df$std.error <- as.numeric(crude_summary_df$std.error)


crude_summary_df$p.value <- as.numeric(crude_summary_df$p.value)

final_results_crude <- crude_summary_df %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 

#final_results_crude$p.adjusted <- stats::p.adjust(final_results_crude$p.value,method = "fdr")

final_results_crude$significance <- ifelse(final_results_crude$p.value <= 0.05, "Yes", "No")
 

final_results_crude$cytokines <- sub(".1$", "", final_results_crude$cytokines)

final_results_crude <- final_results_crude %>%
  dplyr::mutate(cytokines = case_when(
    cytokines == "IFNa2" ~ "IFNα2",
    cytokines == "IL-1a" ~ "IL-1α",
    cytokines == "IL-1b" ~ "IL-1β",
    cytokines == "TNFa" ~ "TNFα",
    cytokines == "MIP-1a" ~ "MIP-1α",
    cytokines == "TNFb" ~ "TNFβ",
    cytokines == "MIP-1b" ~ "MIP-1β",
    cytokines == "IFNg" ~ "IFN-γ",
    cytokines == "IL10" ~ "IL-10",
    TRUE ~ cytokines
  ))
 

write_csv(final_results_crude,"../Processed_data/conditionalHR_adjusted_enrol_29plex.csv")

** MUAC unadjusted 29plex Enrolment

cytokines29plex_metadata <- read_csv("../Processed_data/luminex_29plex_cleaned_numeric_log_auto_scale_long_Outliers_removedEnrolment.csv")
## Rows: 110 Columns: 150
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (23): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, premat...
## dbl  (113): subjid, IL10, IFNg, IL-12p40, MIP-1a, IL-12p70, MIP-1b, G-CSF, V...
## dttm  (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, d...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline_time2death <- read_csv("../Processed_data/baseline_time2death.csv")
## Rows: 128 Columns: 127
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (26): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (85): subjid, time, agemonth, ehg, elymph, muac0, eneutrop, eplatelet, ...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## date  (2): dateenrd_T2E, enddate_T2E
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cytokines29plex_metadata <- cytokines29plex_metadata %>% 
  dplyr::select(1:29)


baseline_time2death <- baseline_time2death %>% 
  dplyr::select(subjid, agemonth, sex, site, arm, case, muac0, discdate, 
                dateenrd_T2E, datem2, enddate_T2E)

baseline_time2death <- baseline_time2death %>% 
  dplyr::mutate(case_recoded = case_when(case == "Case" ~ 1,
                                         case == "Control" ~ 0))

baseline_time2death$subjid <- as.character(baseline_time2death$subjid)

cytokines29plex_metadata$subjid <- as.character(cytokines29plex_metadata$subjid)

cytokines29plex_metadata_HR <- cytokines29plex_metadata %>% dplyr::left_join(., baseline_time2death, join_by(subjid))

cytokines29plex_metadata_HR$time2_death <- as.numeric(difftime(cytokines29plex_metadata_HR$datem2, cytokines29plex_metadata_HR$discdate, units = "days"))

cytokines29plex_metadata_HR
## # A tibble: 110 × 41
##    subjid    IL10    IFNg `IL-12p40` `MIP-1a` `IL-12p70` `MIP-1b` `G-CSF`
##    <chr>    <dbl>   <dbl>      <dbl>    <dbl>      <dbl>    <dbl>   <dbl>
##  1 103     2.06   -0.0595    -0.615   -0.138      -0.409  -0.162   0.590 
##  2 1041   -0.741   2.15       0.0964  -1.00       -0.928  -2.83   -0.0422
##  3 1109   -0.229   0.486      1.38     0.763       0.711  -0.648   0.767 
##  4 1143   -0.904  -0.412      1.04     0.328       0.900  -0.935  -0.0847
##  5 1185   -0.193  -0.339     -0.813    0.268      -0.179   0.903  -0.730 
##  6 1212   -0.532  -0.898      2.14     0.0490      1.30    0.0925  0.0409
##  7 1214    0.122  -0.250      0.855    0.715       0.844   0.353   0.295 
##  8 1270   -0.0710 -1.34      -0.813   -1.00       -0.928   0.261  -2.40  
##  9 1281   -0.283  -0.161      1.37     0.527       0.844  -0.440  -0.230 
## 10 130    -0.574  -1.43      -0.813   -1.00       -0.928   0.352   0.176 
## # ℹ 100 more rows
## # ℹ 33 more variables: VEGF <dbl>, Eotaxin <dbl>, TNFb <dbl>, TNFa <dbl>,
## #   IFNa2 <dbl>, `GM-CSF` <dbl>, `IL-5` <dbl>, `IL-7` <dbl>, `IL-1RA` <dbl>,
## #   `IL-8` <dbl>, `IL-2` <dbl>, `IL-4` <dbl>, `IL-6` <dbl>, `MCP-1` <dbl>,
## #   `IL-15` <dbl>, `IL-13` <dbl>, `IL-17` <dbl>, `IL-1b` <dbl>, `IL-1a` <dbl>,
## #   `IP-10` <dbl>, EGF <dbl>, agemonth <dbl>, sex <chr>, site <chr>, arm <chr>,
## #   case <chr>, muac0 <dbl>, discdate <dttm>, dateenrd_T2E <date>, …
write_csv(cytokines29plex_metadata_HR,"../Processed_data/Model_data/4plex_enrol.csv")

protein_cols <- cytokines29plex_metadata_HR %>% 
  dplyr::select(IL10:EGF)

protein_data <- cytokines29plex_metadata_HR
 
# Initialize lists to store results

adjusted_results <- list()
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth +
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_results[[col]] <- tidy(adjusted_results[[col]])

}
 
adjusted_summary_df <- do.call(rbind,adjusted_results)

crude_summary_df <- adjusted_summary_df %>%
  dplyr::filter(term=='protein[[col]]') %>% 
  rownames_to_column(var = "cytokines") %>% 
  dplyr::select(-term)

crude_summary_df$estimate <- as.numeric(crude_summary_df$estimate)

crude_summary_df$std.error <- as.numeric(crude_summary_df$std.error)


crude_summary_df$p.value <- as.numeric(crude_summary_df$p.value)

final_results_crude <- crude_summary_df %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 

#final_results_crude$p.adjusted <- stats::p.adjust(final_results_crude$p.value,method = "fdr")

final_results_crude$significance <- ifelse(final_results_crude$p.value <= 0.05, "Yes", "No")
 

final_results_crude$cytokines <- sub(".1$", "", final_results_crude$cytokines)

final_results_crude <- final_results_crude %>%
  dplyr::mutate(cytokines = case_when(
    cytokines == "IFNa2" ~ "IFNα2",
    cytokines == "IL-1a" ~ "IL-1α",
    cytokines == "IL-1b" ~ "IL-1β",
    cytokines == "TNFa" ~ "TNFα",
    cytokines == "MIP-1a" ~ "MIP-1α",
    cytokines == "TNFb" ~ "TNFβ",
    cytokines == "MIP-1b" ~ "MIP-1β",
    cytokines == "IFNg" ~ "IFN-γ",
    cytokines == "IL10" ~ "IL-10",
    TRUE ~ cytokines
  ))
 

write_csv(final_results_crude,"../Processed_data/UnadjustedMUAC_HR/conditionalHR_enrolment_29plex.csv")

** Hematological factors MUAC adjusted - enrolment

baseline_time2death <- read_csv("../Processed_data/baseline_time2death.csv")
## Rows: 128 Columns: 127
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (26): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (85): subjid, time, agemonth, ehg, elymph, muac0, eneutrop, eplatelet, ...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## date  (2): dateenrd_T2E, enddate_T2E
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline_hema <- baseline_time2death %>%
  dplyr::mutate(case_recoded = case_when(case=="Case"~1,case=="Control"~0)) # baseline_time2death

hematological_data <- baseline_hema %>% 
  dplyr::select(c(subjid, agemonth, sex, site, arm, case,muac0,ehg,elymph,eneutrop,eplatelet,ewbc,discdate, dateenrd_T2E,datem2, enddate_T2E, case_recoded))

hematological_data$time2_death <- as.numeric(difftime(hematological_data$datem2, hematological_data$discdate, units = "days"))


### Haemoglobin Enrol

hematological_data_only <- hematological_data %>% 
  dplyr::select(subjid, ehg,elymph,eneutrop,eplatelet,ewbc) %>% 
  column_to_rownames(var = "subjid")
  
hematological_data_only_log <- log(na.omit(hematological_data_only)) %>% as.matrix()

hematological_data_only_log_scaled <- mdatools::prep.autoscale(hematological_data_only_log,                                                           center = TRUE, scale = TRUE)


hematological_data_only_log_scaled_df <- as.data.frame(hematological_data_only_log_scaled) %>% 
  rownames_to_column(var = "subjid")

hematological_data_HR <- hematological_data %>% 
  dplyr::select(!c(ehg,elymph,eneutrop,eplatelet,ewbc))


hematological_data_HR$subjid <- as.character(hematological_data_HR$subjid)

hematological_data_HR <-hematological_data_only_log_scaled_df   %>% 
  dplyr::left_join(., hematological_data_HR,
                   join_by(subjid))

hematological_data_HR <- hematological_data_HR %>% 
  dplyr::mutate(Neutr_Lymph_ratio=eneutrop/elymph)

############################################

hematological_data_HR <- hematological_data_HR %>% 
  dplyr::rename(Haemoglobin = ehg,Lymphocytes = elymph, 
                Neutrophils = eneutrop, Platelets = eplatelet,
                WBCs = ewbc, NLR = Neutr_Lymph_ratio)


protein_cols <- hematological_data_HR %>% 
  dplyr::select(Haemoglobin, Lymphocytes, Neutrophils, Platelets, WBCs, NLR)



protein_data <- hematological_data_HR


adjusted_results <- list()
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth + muac0 +
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_results[[col]] <- tidy(adjusted_results[[col]])

}
 
adjusted_summary_df <- do.call(rbind,adjusted_results)

adjusted_summary_df <- adjusted_summary_df %>%
  dplyr::filter(term=='protein[[col]]') %>% 
  rownames_to_column(var = "cytokines") %>%
  dplyr::select(-term)

adjusted_summary_df$estimate <- as.numeric(adjusted_summary_df$estimate)

adjusted_summary_df$std.error <- as.numeric(adjusted_summary_df$std.error)


adjusted_summary_df$p.value <- as.numeric(adjusted_summary_df$p.value)

final_results_adjusted <- adjusted_summary_df %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 

final_results_adjusted$significance <- ifelse(final_results_adjusted$p.value <= 0.05, "Yes", "No")


final_results_adjusted$cytokines <- sub(".1$", "", final_results_adjusted$cytokines)

write_csv(final_results_adjusted,"../Processed_data/conditionalHR_adjusted_enrol_haematology.csv")

** MUAC unadjusted Hematological factors - enrolment

baseline_time2death <- read_csv("../Processed_data/baseline_time2death.csv")
## Rows: 128 Columns: 127
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (26): case, site, cerebral, diarrho, epilepsy, iconsc, pneumon, prematu...
## dbl  (85): subjid, time, agemonth, ehg, elymph, muac0, eneutrop, eplatelet, ...
## dttm (14): dateadmin, dateenrd, discdate, datem1, datem2, datem3, datem4, da...
## date  (2): dateenrd_T2E, enddate_T2E
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline_hema <- baseline_time2death %>%
  dplyr::mutate(case_recoded = case_when(case=="Case"~1,case=="Control"~0)) # baseline_time2death

hematological_data <- baseline_hema %>% 
  dplyr::select(c(subjid, agemonth, sex, site, arm, case,muac0,ehg,elymph,eneutrop,eplatelet,ewbc,discdate, dateenrd_T2E,datem2, enddate_T2E, case_recoded))

hematological_data$time2_death <- as.numeric(difftime(hematological_data$datem2, hematological_data$discdate, units = "days"))


### Haemoglobin Enrol

hematological_data_only <- hematological_data %>% 
  dplyr::select(subjid, ehg,elymph,eneutrop,eplatelet,ewbc) %>% 
  column_to_rownames(var = "subjid")
  
hematological_data_only_log <- log(na.omit(hematological_data_only)) %>% as.matrix()

hematological_data_only_log_scaled <- mdatools::prep.autoscale(hematological_data_only_log,                                                           center = TRUE, scale = TRUE)


hematological_data_only_log_scaled_df <- as.data.frame(hematological_data_only_log_scaled) %>% 
  rownames_to_column(var = "subjid")

hematological_data_HR <- hematological_data %>% 
  dplyr::select(!c(ehg,elymph,eneutrop,eplatelet,ewbc))


hematological_data_HR$subjid <- as.character(hematological_data_HR$subjid)

hematological_data_HR <-hematological_data_only_log_scaled_df   %>% 
  dplyr::left_join(., hematological_data_HR,
                   join_by(subjid))

hematological_data_HR <- hematological_data_HR %>% 
  dplyr::mutate(Neutr_Lymph_ratio=eneutrop/elymph)

############################################

hematological_data_HR <- hematological_data_HR %>% 
  dplyr::rename(Haemoglobin = ehg,Lymphocytes = elymph, 
                Neutrophils = eneutrop, Platelets = eplatelet,
                WBCs = ewbc, NLR = Neutr_Lymph_ratio)


protein_cols <- hematological_data_HR %>% 
  dplyr::select(Haemoglobin, Lymphocytes, Neutrophils, Platelets, WBCs, NLR)



protein_data <- hematological_data_HR


adjusted_results <- list()
 
for (col in colnames(protein_cols)) {
  protein = protein_data[col]
  adjusted_results[[col]] <- coxph(Surv(time2_death, case_recoded) ~
                         protein[[col]] + agemonth + 
                           strata(sex, site, arm), data = protein_data)
  
  adjusted_results[[col]] <- tidy(adjusted_results[[col]])

}
 
adjusted_summary_df <- do.call(rbind,adjusted_results)

adjusted_summary_df <- adjusted_summary_df %>%
  dplyr::filter(term=='protein[[col]]') %>% 
  rownames_to_column(var = "cytokines") %>%
  dplyr::select(-term)

adjusted_summary_df$estimate <- as.numeric(adjusted_summary_df$estimate)

adjusted_summary_df$std.error <- as.numeric(adjusted_summary_df$std.error)


adjusted_summary_df$p.value <- as.numeric(adjusted_summary_df$p.value)

final_results_adjusted <- adjusted_summary_df %>% 

  mutate(

    HR = exp(estimate),

    Lower = exp(estimate - 1.96 * std.error),

    Upper = exp(estimate + 1.96 * std.error)

  ) 

final_results_adjusted$significance <- ifelse(final_results_adjusted$p.value <= 0.05, "Yes", "No")


final_results_adjusted$cytokines <- sub(".1$", "", final_results_adjusted$cytokines)

write_csv(final_results_adjusted,"../Processed_data/UnadjustedMUAC_HR/conditionalHR_enrolment_haematology.csv")

*** Hazard ratios plots for Endothelial markers and cytokines (4plex and 29plex)

***** Month 2 MUAC adjusted

M24plex <- read_csv("../Processed_data/conditionalHR_adjusted_m2_4plex.csv")
## Rows: 4 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cytokines, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
M229plex <- read_csv("../Processed_data/conditionalHR_adjusted_m2_29plex.csv")
## Rows: 4 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cytokines, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
CombM2_4_29Plex <- bind_rows(M24plex, M229plex)

#CombM2_4_29Plex$cytokines <- gsub("-", "", CombM2_4_29Plex$cytokines)

CombM2_4_29Plex %>% dplyr::arrange(desc(estimate)) %>% 

  ggforestplot::forestplot(name = cytokines, estimate = estimate, 

                           se = std.error, logodds = T, 

                           boxsize = 0.2, 

                           colour = significance, 

                           xlab = "HR (95% CI)", 

                           ylab = "Cytokines and endothelial markers at Month 2",

                            title = paste0("Controls", "                    ", "     Cases"),

                           ) +

  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 15),

        axis.text.x = element_text(size = 15),

        axis.text.y = element_text(size = 15, face = "bold"),

        legend.position = "none",

        legend.title = element_text(size = 15, face = "bold"),

        legend.text = element_text(size = 15, face = "bold"),

        axis.title.x = element_text(size = 15, face = "bold", colour = "blue"),

        axis.title.y = element_text(size = 15, face = "bold", colour = "blue"))

#ggsave("../Figures/Hazard_ratios/Endothelialand cytokines_M2.jpeg", width = 6, height= 10, dpi = 600, units = "in")
#ggsave("../Figures/Hazard_ratios/Endothelialand cytokines_M2.pdf", width = 6, height= 10, dpi = 600, units = "in")

***** Enrolment

Enr4plex <- read_csv("../Processed_data/conditionalHR_adjusted_disc_4plex.csv")
## Rows: 4 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cytokines, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Enr29plex <- read_csv("../Processed_data/conditionalHR_adjusted_disc_29plex.csv")
## Rows: 28 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cytokines, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
CombEnrl4_29Plex <- bind_rows(Enr4plex, Enr29plex)

CombEnrl4_29Plex %>% dplyr::arrange(desc(estimate)) %>% 

  ggforestplot::forestplot(name = cytokines, estimate = estimate, 

                           se = std.error, logodds = T, 

                           boxsize = 0.2, 

                           colour = significance, 

                           xlab = "HR (95% CI)", 

                           ylab = "Cytokines and endothelial markers at Enrolment",

                            title = paste0("Controls", "                    ", "Cases"),

                           ) +

  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 15),

        axis.text.x = element_text(size = 15),

        axis.text.y = element_text(size = 15, face = "bold"),

        legend.position = "none",

        legend.title = element_text(size = 15, face = "bold"),#size = 15 for pdf

        legend.text = element_text(size = 15, face = "bold"),

        axis.title.x = element_text(size = 15, face = "bold", colour = "blue"),#size = 65 for jpeg

        axis.title.y = element_text(size = 15, face = "bold", colour = "blue"))

#ggsave("../Figures/Hazard_ratios/Endothelialand cytokines_enrolment.jpeg", width = 6, height= 10, dpi = 600, units = "in")
 
#ggsave("../Figures/Hazard_ratios/Endothelialand cytokines_enrolment.pdf", width = 6, height= 10, dpi = 600, units = "in")

** Hazard ratios plots for proteomics *** Enrolment

EnrProteomics <- read_csv("../Processed_data/conditionalHR_adjusted_proteins_disc.csv")
## Rows: 423 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): cytokines, Gene.names, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#EnrProteomics$Gene.names <- gsub("-", "", EnrProteomics$Gene.names)
 
EnrProteomicsSign <- EnrProteomics %>% dplyr::filter(significance == "Yes")
 
EnrProteomicsSign %>% dplyr::arrange(desc(estimate)) %>% 

  ggforestplot::forestplot(name = Gene.names, estimate = estimate, 

                           se = std.error, logodds = T, 

                           boxsize = 0.2, 

                           colour = significance, 

                           xlab = "HR (95% CI)", 

                           ylab = "Proteins at Enrolment",

                            title = paste0("Controls   ", "               ", "Cases"),

                           ) +

theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 15),

        axis.text.x = element_text(size = 15),

        axis.text.y = element_text(size = 15, face = "bold"),

        legend.position = "none",

        legend.title = element_text(size = 15, face = "bold"),

        legend.text = element_text(size = 15, face = "bold"),

        axis.title.x = element_text(size = 15, face = "bold", colour = "blue"),

        axis.title.y = element_text(size = 15, face = "bold", colour = "blue"))

#ggsave("../Figures/Hazard_ratios/proteomics_enrol.jpeg", width = 5.5, height = 8, dpi = 600, units = "in")
#ggsave("../Figures/Hazard_ratios/proteomics_enrol.pdf", width = 5.5, height = 8, dpi = 600, units = "in")

*** Month 2

M2Proteomics <- read_csv("../Processed_data/conditionalHR_adjusted_proteinm2.csv")
## Rows: 374 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): cytokines, Gene.names, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
M2ProteomicsSign <- M2Proteomics %>% dplyr::filter(significance == "Yes")
 
M2ProteomicsSign %>% dplyr::arrange(desc(estimate)) %>% 

  ggforestplot::forestplot(name = Gene.names, estimate = estimate, 

                           se = std.error, logodds = T, 

                           boxsize = 0.2, 

                           colour = significance, 

                           xlab = "HR (95% CI)", 

                           ylab = "Proteins at Month 2",

                            title = paste0("Controls", "                ", "Cases"),

                           ) +

theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 15),

        axis.text.x = element_text(size = 15),

        axis.text.y = element_text(size = 15, face = "bold"),

        legend.position = "none",

        legend.title = element_text(size = 15, face = "bold"),

        legend.text = element_text(size = 15, face = "bold"),

        axis.title.x = element_text(size = 15, face = "bold", colour = "blue"),

        axis.title.y = element_text(size = 15, face = "bold", colour = "blue"))

#ggsave("../Figures/Hazard_ratios/proteomics_M2.jpeg", width = 5, height = 6, dpi = 600, units = "in")
 
#ggsave("../Figures/Hazard_ratios/proteomics_M2.pdf", width = 5, height = 6, dpi = 600, units = "in") 

** Hazard ratios plots for Haematology *** Enrolment

EnrHaema <- read_csv("../Processed_data/conditionalHR_adjusted_enrol_haematology.csv")
## Rows: 6 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cytokines, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
EnrHaema %>% dplyr::arrange(desc(estimate)) %>% 

  ggforestplot::forestplot(name = cytokines, estimate = estimate, 

                           se = std.error, logodds = T, 

                           boxsize = 0.2, 

                           colour = significance, 

                           xlab = "HR (95% CI)", 

                           ylab = "Haematological parameters at Enrolment",

                            title = "Controls               Cases",

                           ) +

theme(plot.title = element_text(hjust = 0.85, face = "bold", size = 15),

        axis.text.x = element_text(size = 15),

        axis.text.y = element_text(size = 15, face = "bold"),

        legend.position = "none",

        legend.title = element_text(size = 15, face = "bold"),

        legend.text = element_text(size = 15, face = "bold"),

        axis.title.x = element_text(size = 15, face = "bold", colour = "blue"),

        axis.title.y = element_text(size = 15, face = "bold", colour = "blue"))

#ggsave("../Figures/Hazard_ratios/Haematological_enrol.jpeg", width = 6.5, height = 4.5, dpi = 600, units = "in")

#ggsave("../Figures/Hazard_ratios/Haematological_enrol.pdf", width = 6.5, height = 4.5, dpi = 600, units = "in") 

*** Month 2

M2Haema <- read_csv("../Processed_data/conditionalHR_adjusted_M2_haematology.csv")
## Rows: 6 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cytokines, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
M2Haema %>% dplyr::arrange(desc(estimate)) %>% 

  ggforestplot::forestplot(name = cytokines, estimate = estimate, 

                           se = std.error, logodds = T, 

                           boxsize = 0.2, 

                           colour = significance, 

                           xlab = "HR (95% CI)", 

                           ylab = "Haematological parameters at Month 2",

                            title = paste0("Controls", "              ", "Cases"),

                           ) +

theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 15),

        axis.text.x = element_text(size = 15),

        axis.text.y = element_text(size = 15, face = "bold"),

        legend.position = "none",

        legend.title = element_text(size = 15, face = "bold"),

        legend.text = element_text(size = 15, face = "bold"),

        axis.title.x = element_text(size = 15, face = "bold", colour = "blue"),

        axis.title.y = element_text(size = 15, face = "bold", colour = "blue"))

#ggsave("../Figures/Hazard_ratios/Haematological_M2.jpeg", width = 6.5, height = 4.5, dpi = 600, units = "in")
 
#ggsave("../Figures/Hazard_ratios/Haematological_M2.pdf", width = 6.5, height = 4.5, dpi = 600, units = "in")  

MUAC Unadjusted Hazard ratios

***** Month 2 MUAC

M24plex <- read_csv("../Processed_data/UnadjustedMUAC_HR/conditionalHR_m2_4plex.csv")
## Rows: 4 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cytokines, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
M229plex <- read_csv("../Processed_data/UnadjustedMUAC_HR/conditionalHR_m2_29plex.csv")
## Rows: 4 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cytokines, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
CombM2_4_29Plex <- bind_rows(M24plex, M229plex)

#CombM2_4_29Plex$cytokines <- gsub("-", "", CombM2_4_29Plex$cytokines)

CombM2_4_29Plex %>% dplyr::arrange(desc(estimate)) %>% 

  ggforestplot::forestplot(name = cytokines, estimate = estimate, 

                           se = std.error, logodds = T, 

                           boxsize = 0.2, 

                           colour = significance, 

                           xlab = "HR (95% CI)", 

                           ylab = "Cytokines and endothelial markers at Month 2",

                            title = paste0("Controls", "                    ", "     Cases"),

                           ) +

  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 15),

        axis.text.x = element_text(size = 15),

        axis.text.y = element_text(size = 15, face = "bold"),

        legend.position = "none",

        legend.title = element_text(size = 15, face = "bold"),

        legend.text = element_text(size = 15, face = "bold"),

        axis.title.x = element_text(size = 15, face = "bold", colour = "blue"),

        axis.title.y = element_text(size = 15, face = "bold", colour = "blue"))

#ggsave("../Figures/Hazard_ratios/UnadjustedMUAC/Endothelialand cytokines_M2.jpeg", width = 6, height= 10, dpi = 600, units = "in")
#ggsave("../Figures/Hazard_ratios/UnadjustedMUAC/Endothelialand cytokines_M2.pdf", width = 6, height= 10, dpi = 600, units = "in")

***** Enrolment

Enr4plex <- read_csv("../Processed_data/UnadjustedMUAC_HR/conditionalHR_enrolment_4plex.csv")
## Rows: 4 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cytokines, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Enr29plex <- read_csv("../Processed_data/UnadjustedMUAC_HR/conditionalHR_enrolment_29plex.csv")
## Rows: 28 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cytokines, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
CombEnrl4_29Plex <- bind_rows(Enr4plex, Enr29plex)

CombEnrl4_29Plex %>% dplyr::arrange(desc(estimate)) %>% 

  ggforestplot::forestplot(name = cytokines, estimate = estimate, 

                           se = std.error, logodds = T, 

                           boxsize = 0.2, 

                           colour = significance, 

                           xlab = "HR (95% CI)", 

                           ylab = "Cytokines and endothelial markers at Enrolment",

                            title = paste0("Controls", "                    ", "Cases"),

                           ) +

  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 15),

        axis.text.x = element_text(size = 15),

        axis.text.y = element_text(size = 15, face = "bold"),

        legend.position = "none",

        legend.title = element_text(size = 15, face = "bold"),#size = 15 for pdf

        legend.text = element_text(size = 15, face = "bold"),

        axis.title.x = element_text(size = 15, face = "bold", colour = "blue"),#size = 65 for jpeg

        axis.title.y = element_text(size = 15, face = "bold", colour = "blue"))

#ggsave("../Figures/Hazard_ratios/UnadjustedMUAC/Endothelialand cytokines_enrolment.jpeg", width = 6, height= 10, dpi = 600, units = "in")
 
#ggsave("../Figures/Hazard_ratios/UnadjustedMUAC/Endothelialand cytokines_enrolment.pdf", width = 6, height= 10, dpi = 600, units = "in")

** Hazard ratios plots for proteomics *** Enrolment

EnrProteomics <- read_csv("../Processed_data/UnadjustedMUAC_HR/conditionalHR_proteins_enrolment.csv")
## Rows: 423 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): cytokines, Gene.names, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#EnrProteomics$Gene.names <- gsub("-", "", EnrProteomics$Gene.names)
 
EnrProteomicsSign <- EnrProteomics %>% dplyr::filter(significance == "Yes")
 
EnrProteomicsSign %>% dplyr::arrange(desc(estimate)) %>% 

  ggforestplot::forestplot(name = Gene.names, estimate = estimate, 

                           se = std.error, logodds = T, 

                           boxsize = 0.2, 

                           colour = significance, 

                           xlab = "HR (95% CI)", 

                           ylab = "Proteins at Enrolment",

                            title = paste0("Controls   ", "               ", "Cases"),

                           ) +

theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 15),

        axis.text.x = element_text(size = 15),

        axis.text.y = element_text(size = 15, face = "bold"),

        legend.position = "none",

        legend.title = element_text(size = 15, face = "bold"),

        legend.text = element_text(size = 15, face = "bold"),

        axis.title.x = element_text(size = 15, face = "bold", colour = "blue"),

        axis.title.y = element_text(size = 15, face = "bold", colour = "blue"))

#ggsave("../Figures/Hazard_ratios/UnadjustedMUAC/proteomics_enrol.jpeg", width = 5.5, height = 8, dpi = 600, units = "in")
#ggsave("../Figures/Hazard_ratios/UnadjustedMUAC/proteomics_enrol.pdf", width = 5.5, height = 8, dpi = 600, units = "in")

*** Month 2

M2Proteomics <- read_csv("../Processed_data/UnadjustedMUAC_HR/conditionalHR_proteinm2.csv")
## Rows: 374 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): cytokines, Gene.names, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
M2ProteomicsSign <- M2Proteomics %>% dplyr::filter(significance == "Yes")
 
M2ProteomicsSign %>% dplyr::arrange(desc(estimate)) %>% 

  ggforestplot::forestplot(name = Gene.names, estimate = estimate, 

                           se = std.error, logodds = T, 

                           boxsize = 0.2, 

                           colour = significance, 

                           xlab = "HR (95% CI)", 

                           ylab = "Proteins at Month 2",

                            title = paste0("Controls", "                ", "Cases"),

                           ) +

theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 15),

        axis.text.x = element_text(size = 15),

        axis.text.y = element_text(size = 15, face = "bold"),

        legend.position = "none",

        legend.title = element_text(size = 15, face = "bold"),

        legend.text = element_text(size = 15, face = "bold"),

        axis.title.x = element_text(size = 15, face = "bold", colour = "blue"),

        axis.title.y = element_text(size = 15, face = "bold", colour = "blue"))

#ggsave("../Figures/Hazard_ratios/UnadjustedMUAC/proteomics_M2.jpeg", width = 5, height = 6, dpi = 600, units = "in")
 
#ggsave("../Figures/Hazard_ratios/UnadjustedMUAC/proteomics_M2.pdf", width = 5, height = 6, dpi = 600, units = "in") 

** Hazard ratios plots for Haematology *** Enrolment

EnrHaema <- read_csv("../Processed_data/UnadjustedMUAC_HR/conditionalHR_enrolment_haematology.csv")
## Rows: 6 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cytokines, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
EnrHaema %>% dplyr::arrange(desc(estimate)) %>% 

  ggforestplot::forestplot(name = cytokines, estimate = estimate, 

                           se = std.error, logodds = T, 

                           boxsize = 0.2, 

                           colour = significance, 

                           xlab = "HR (95% CI)", 

                           ylab = "Haematological parameters at Enrolment",

                            title = "Controls               Cases",

                           ) +

theme(plot.title = element_text(hjust = 0.85, face = "bold", size = 15),

        axis.text.x = element_text(size = 15),

        axis.text.y = element_text(size = 15, face = "bold"),

        legend.position = "none",

        legend.title = element_text(size = 15, face = "bold"),

        legend.text = element_text(size = 15, face = "bold"),

        axis.title.x = element_text(size = 15, face = "bold", colour = "blue"),

        axis.title.y = element_text(size = 15, face = "bold", colour = "blue"))

#ggsave("../Figures/Hazard_ratios/UnadjustedMUAC/Haematological_enrol.jpeg", width = 6.5, height = 4.5, dpi = 600, units = "in")

#ggsave("../Figures/Hazard_ratios/UnadjustedMUAC/Haematological_enrol.pdf", width = 6.5, height = 4.5, dpi = 600, units = "in") 

*** Month 2

M2Haema <- read_csv("../Processed_data/UnadjustedMUAC_HR/conditionalHR_M2_haematology.csv")
## Rows: 6 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cytokines, significance
## dbl (7): estimate, std.error, statistic, p.value, HR, Lower, Upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
M2Haema %>% dplyr::arrange(desc(estimate)) %>% 

  ggforestplot::forestplot(name = cytokines, estimate = estimate, 

                           se = std.error, logodds = T, 

                           boxsize = 0.2, 

                           colour = significance, 

                           xlab = "HR (95% CI)", 

                           ylab = "Haematological parameters at Month 2",

                            title = paste0("Controls", "              ", "Cases"),

                           ) +

theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 15),

        axis.text.x = element_text(size = 15),

        axis.text.y = element_text(size = 15, face = "bold"),

        legend.position = "none",

        legend.title = element_text(size = 15, face = "bold"),

        legend.text = element_text(size = 15, face = "bold"),

        axis.title.x = element_text(size = 15, face = "bold", colour = "blue"),

        axis.title.y = element_text(size = 15, face = "bold", colour = "blue"))

#ggsave("../Figures/Hazard_ratios/UnadjustedMUAC/Haematological_M2.jpeg", width = 6.5, height = 4.5, dpi = 600, units = "in")
 
#ggsave("../Figures/Hazard_ratios/UnadjustedMUAC/Haematological_M2.pdf", width = 6.5, height = 4.5, dpi = 600, units = "in")  

Lineplot

baseline <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")

baseline_Table_enrol <- baseline %>% 
  dplyr::select(subjid, agemonth, sex, site, case, arm, pneumon, diarrho, cerebral, shock,ehg, elymph, eplatelet, ewbc, eneutrop, ends_with("0"))


baseline_Table_enrol <- baseline_Table_enrol %>% dplyr::select(!ends_with("10"))

baseline_Table_enrol <- baseline_Table_enrol %>% 
    dplyr::select(subjid,agemonth, sex, site, case, arm, muac0, zlen0, zwei0,
                  zwfl0,pneumon, diarrho, shock, cerebral,oedema0,
                  ehg,eplatelet, ewbc, elymph, eneutrop)


baseline_Table1 <- baseline %>% 
  dplyr::select(subjid, agemonth, sex, site, case, arm, pneumon, diarrho, cerebral, shock, ends_with("2"))


baseline_Table1 <- baseline_Table1 %>% dplyr::select(!ends_with("12"), -c(height2, weight2, datem2))

baseline_Table_M2 <- baseline_Table1 %>% 
  dplyr::select(subjid, agemonth, sex, site, case, arm, muac2, zlen2, zwei2, zwfl2, pneumon, diarrho, shock, cerebral,oedema2, hg2, platelet2, wbc2, lymph2, neutrop2)

base_enrol <- baseline_Table_enrol %>% dplyr::select(c(1,5,7:10,16:20))
base_enrol$Timepoint <- "Enrolment"


base_M2 <- baseline_Table_M2 %>% dplyr::select(c(1,5,7:10,16:20))
base_M2$Timepoint <- "Month 2"

base_enrol <- base_enrol %>% 
  pivot_longer(cols = -c(subjid,Timepoint, case),
               names_to = "haematology",
               values_to = "values")

base_M2 <- base_M2 %>% 
  pivot_longer(cols = -c(subjid,Timepoint,case),
               names_to = "haematology",
               values_to = "values")

combined_lineplot_long <- bind_rows(base_enrol,base_M2)

combined_lineplot_long <- combined_lineplot_long %>% 
  dplyr::mutate(haematology = 
                  case_when(haematology=="ehg"| haematology == "hg2" ~ "Haemoglobin",
                            haematology=="eplatelet" | haematology == "platelet2" ~"Platelets",
                            haematology=="ewbc" | haematology =="wbc2" ~ "WBCs",
                            haematology=="elymph" | haematology == "lymph2" ~ "Lymphocytes",
                            haematology=="eneutrop"| haematology == "neutrop2" ~ "Neutrophils", 
                            haematology=="muac0"| haematology == "muac2" ~ "MUAC", TRUE ~ haematology))

combined_lineplot_long <- combined_lineplot_long %>% 
  dplyr::mutate(case = 
                  case_when(case=="Case" ~ "Cases",
                            case=="Control" ~ "Controls"))

combined_lineplot_long_haem <- combined_lineplot_long %>% dplyr::filter(haematology %in% c("Haemoglobin",
                    "Lymphocytes","Neutrophils","Platelets","WBCs","MUAC"))

write_csv(combined_lineplot_long_haem, "../Processed_data/combined_lineplot_long_haematology.csv")

** Platelets

combined_lineplot_long_haem <- read_csv("../Processed_data/combined_lineplot_long_haematology.csv")
## Rows: 1536 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): case, Timepoint, haematology
## dbl (2): subjid, values
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
combined_lineplot_long_haem %>%
  dplyr::filter(haematology == "Platelets") %>%
  ggplot(aes(x = Timepoint, y = values)) +

  # Add boxplot
  geom_boxplot(alpha = 0.2, outlier.shape = NA, width = 0.3) +

    # Add individual subject lines and points
  geom_line(aes(group = subjid, color = case), size = 0.75, alpha = 0.5) +
  geom_smooth(aes(group=case), method = "loess", se = T, color = "red",alpha = 0.5, size = 1) +
  
  geom_point(aes(color = case), size = 0.7) +

  # Color scales
  ggsci::scale_color_nejm() +
  #ggsci::scale_fill_nejm() +

  # Facet by case
  facet_grid(~case, scales = "free_y") +

  # Add p-values
  stat_compare_means(
    comparisons = list(c("Enrolment", "Month 2")),
    method = "wilcox.test",
    paired = TRUE,
    label = "p.format",
    size = 2.8
  ) +

  # Labels and theme
  labs(
    y = "Platelets (10^3/µL)",
    x = "Timepoint",
    color = "Survival status",
    fill = "Survival status"
  ) +
  theme_pubr() +
  theme(
    axis.title = element_text(face = "bold"),
    strip.text = element_text(face = "bold"),
    axis.text = element_text(face = "bold", size = 10),
    legend.position = "none"
  )
## `geom_smooth()` using formula = 'y ~ x'

ggsave("../Figures/Lineplots/Platelets_lineplot.pdf", height = 3.75, width = 4)
## `geom_smooth()` using formula = 'y ~ x'

** Lymphocytes

combined_lineplot_long_haem <- read_csv("../Processed_data/combined_lineplot_long_haematology.csv")
## Rows: 1536 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): case, Timepoint, haematology
## dbl (2): subjid, values
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
combined_lineplot_long_haem %>%
  dplyr::filter(haematology == "Lymphocytes") %>%
  ggplot(aes(x = Timepoint, y = values)) +

  # Add boxplot
  geom_boxplot(alpha = 0.2, outlier.shape = NA, width = 0.3) +

  # Add individual subject lines and points
  geom_line(aes(group = subjid, color = case), size = 0.75, alpha = 0.5) +
  geom_smooth(aes(group=case), method = "loess", se = T, color = "red",alpha = 0.5, size = 1) +
  geom_point(aes(color = case), size = 0.7) +

  # Color scales
  ggsci::scale_color_nejm() +
  #ggsci::scale_fill_nejm() +

  # Facet by case
  facet_grid(~case, scales = "free_y") +

  # Add p-values
  stat_compare_means(
    comparisons = list(c("Enrolment", "Month 2")),
    method = "wilcox.test",
    paired = TRUE,
    label = "p.format",
    size = 2.8
  ) +

  # Labels and theme
  labs(
    y = "Lymphocytes (10^3/µL)",
    x = "Timepoint",
    color = "Survival status",
    fill = "Survival status"
  ) +
  theme_pubr() +
  theme(
    axis.title = element_text(face = "bold"),
    strip.text = element_text(face = "bold"),
    axis.text = element_text(face = "bold", size = 10),
    legend.position = "none"
  )
## `geom_smooth()` using formula = 'y ~ x'

ggsave("../Figures/Lineplots/Lymphocytes_lineplot.pdf", height = 3.75, width = 4)
## `geom_smooth()` using formula = 'y ~ x'

*** Neutrophils

combined_lineplot_long_haem <- read_csv("../Processed_data/combined_lineplot_long_haematology.csv")
## Rows: 1536 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): case, Timepoint, haematology
## dbl (2): subjid, values
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
combined_lineplot_long_haem %>%
  dplyr::filter(haematology == "Neutrophils" & subjid != 789) %>%
  ggplot(aes(x = Timepoint, y = values)) +

  # Add boxplot
  geom_boxplot(alpha = 0.2, outlier.shape = NA, width = 0.3) +

  # Add individual subject lines and points
  geom_line(aes(group = subjid, color = case), size = 0.75, alpha = 0.5) +
  geom_smooth(aes(group=case), method = "loess", se = T, color = "red",alpha = 0.5, size = 1) +
  geom_point(aes(color = case), size = 0.7) +

  # Color scales
  ggsci::scale_color_nejm() +
  #ggsci::scale_fill_nejm() +

  # Facet by case
  facet_grid(~case, scales = "free_y") +

  # Add p-values
  stat_compare_means(
    comparisons = list(c("Enrolment", "Month 2")),
    method = "wilcox.test",
    paired = TRUE,
    label = "p.format",
    size = 2.8
  ) +

  # Labels and theme
  labs(
    y = "Neutrophils (10^3/µL)",
    x = "Timepoint",
    color = "Survival status",
    fill = "Survival status"
  ) +
  theme_pubr() +
  theme(
    axis.title = element_text(face = "bold"),
    strip.text = element_text(face = "bold"),
    axis.text = element_text(face = "bold", size = 10),
    legend.position = "none"
  )
## `geom_smooth()` using formula = 'y ~ x'

ggsave("../Figures/Lineplots/Neutrophils_lineplot.pdf", height = 3.75, width = 4)
## `geom_smooth()` using formula = 'y ~ x'

*** WBCs

combined_lineplot_long_haem <- read_csv("../Processed_data/combined_lineplot_long_haematology.csv")
## Rows: 1536 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): case, Timepoint, haematology
## dbl (2): subjid, values
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
combined_lineplot_long_haem %>%
  dplyr::filter(haematology == "WBCs" & subjid != 789) %>%
  ggplot(aes(x = Timepoint, y = values)) +

  # Add boxplot
  geom_boxplot(alpha = 0.2, outlier.shape = NA, width = 0.3) +

  # Add individual subject lines and points
  geom_line(aes(group = subjid, color = case), size = 0.75, alpha = 0.5) +
  geom_smooth(aes(group=case), method = "loess", se = T, color = "red",alpha = 0.5, size = 1) +
  geom_point(aes(color = case), size = 0.7) +

  # Color scales
  ggsci::scale_color_nejm() +
  #ggsci::scale_fill_nejm() +

  # Facet by case
  facet_grid(~case, scales = "free_y") +

  # Add p-values
  stat_compare_means(
    comparisons = list(c("Enrolment", "Month 2")),
    method = "wilcox.test",
    paired = TRUE,
    label = "p.format",
    size = 2.8
  ) +

  # Labels and theme
  labs(
    y = "WBCs (10^3/µL)",
    x = "Timepoint",
    color = "Survival status",
    fill = "Survival status"
  ) +
  theme_pubr() +
  theme(
    axis.title = element_text(face = "bold"),
    strip.text = element_text(face = "bold"),
    axis.text = element_text(face = "bold", size = 10),
    legend.position = "none"
  )
## `geom_smooth()` using formula = 'y ~ x'

ggsave("../Figures/Lineplots/WBCs_lineplot.pdf", height = 3.75, width = 4)
## `geom_smooth()` using formula = 'y ~ x'

**Haemoglobin

combined_lineplot_long_haem <- read_csv("../Processed_data/combined_lineplot_long_haematology.csv")
## Rows: 1536 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): case, Timepoint, haematology
## dbl (2): subjid, values
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
combined_lineplot_long_haem_anaemia <- combined_lineplot_long_haem %>%
  dplyr::filter(haematology == "Haemoglobin" & subjid != 551) # 551 an outlier


combined_lineplot_long_haem_anaemia <- combined_lineplot_long_haem_anaemia %>% 
  dplyr::mutate(anaemia_status = case_when(values > 11 ~ "None",
                                           values < 7 ~ "Severe",
                                           values >= 10 & values <=11 ~ "Mild",
                                           values >= 7 & values < 10 ~ "Moderate"))


combined_lineplot_long_haem_anaemia$anaemia_status <- 
  factor(combined_lineplot_long_haem_anaemia$anaemia_status,
         levels = c("None","Mild", 
                    "Moderate", "Severe"))

group_stats <- combined_lineplot_long_haem_anaemia %>% 
  dplyr::filter(Timepoint=="Month 2" & case == "Cases") %>% 
  group_by(anaemia_status) %>% 
  summarise(n = sum(!is.na(anaemia_status)),
            prop = n/53*100) %>% 
  ungroup() %>% slice(-5)


summary_label <- paste0(
  "None (n=", group_stats$n[group_stats$anaemia_status == "None"], "):        ", round(group_stats$prop[1], 1),"%" , "\n",
  "Mild (n=", group_stats$n[group_stats$anaemia_status == "Mild"], "):            ", round(group_stats$prop[2], 1), "%" ,  "\n",
  "Moderate (n=", group_stats$n[group_stats$anaemia_status == "Moderate"], "): ", round(group_stats$prop[3], 1),"%", "\n", 
  "Severe (n=", group_stats$n[group_stats$anaemia_status == "Severe"], "):         ",
  round(group_stats$prop[4], 1), "%"
)

p <- ggplot(data = combined_lineplot_long_haem_anaemia, 
            aes(x = Timepoint, y = values)) +

  # Add boxplot
  geom_boxplot(alpha = 0.2, outlier.shape = NA, width = 0.3) +
  

  # Add individual subject lines and points
  geom_line(aes(group = subjid, color = case), size = 0.75, alpha = 0.5) +
  geom_smooth(aes(group=case), method = "loess", se = T, color = "red",alpha = 0.5, size = 1) +
  geom_point(aes(color = case), size = 0.7) +
  geom_hline(yintercept = 11,color="darkgreen", linewidth=0.7, linetype=2)+
  geom_hline(yintercept = 10,color="purple", linewidth=0.7, linetype=2)+
  geom_hline(yintercept = 7,color="brown", linewidth=0.7, linetype=2)+
  # Shaded regions
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = 10, ymax = 11, fill = "grey80", alpha = 0.35) + 
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = 7, ymax = 10, fill = "grey40", alpha = 0.3) +
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = 4, ymax = 7, fill = "grey10", alpha = 0.3) +

 # geom_label(data = data.frame(
 #    case = "Cases",  # only show in this facet
 #    x = 1.85,
 #    y = 17.55,
 #    label = summary_label),
 #  aes(x = x, y = y, label = label),
 #  size = 3.25, hjust = 0, vjust = 1,
 #  fill = "white", color = "black", 
 #  label.size = 0.4, fontface = "plain",
 #  inherit.aes = FALSE) +
 # 
  # Color scales
  ggsci::scale_color_nejm() +
  #ggsci::scale_fill_nejm() +

  # Facet by case
  facet_grid(~case, scales = "free_y") +

  # Add p-values
  stat_compare_means(
    comparisons = list(c("Enrolment", "Month 2")),
    method = "wilcox.test",
    paired = TRUE,
    label = "p.format",
    size = 2.8
  ) +

  # Labels and theme
  labs(
    y = "Haemoglobin (g/dl)",
    x = "Timepoint",
    color = "Survival status",
    fill = "Survival status"
  ) +
  theme_pubr() +
   theme(
    axis.title = element_text(face = "bold", size = 10),
    strip.text = element_text(face = "bold", size = 10),
    axis.text = element_text(face = "bold", size = 10),
    legend.position = "none"
  )

p
## `geom_smooth()` using formula = 'y ~ x'

ggsave("../Figures/Lineplots/Haemoglobin_lineplot_Jul2025.pdf", 
       height = 3.75, width = 4)
## `geom_smooth()` using formula = 'y ~ x'
group_stats <- combined_lineplot_long_haem_anaemia %>% 
  dplyr::filter(Timepoint=="Enrolment" & case == "Cases") %>% 
  group_by(anaemia_status) %>% 
  summarise(n = sum(!is.na(anaemia_status)),
            prop = n/60*100) %>% 
  ungroup() %>% slice(-5)


summary_label <- paste0(
  "None (n=", group_stats$n[group_stats$anaemia_status == "None"], "):        ", round(group_stats$prop[1], 1),"%" , "\n",
  "Mild (n=", group_stats$n[group_stats$anaemia_status == "Mild"], "):          ", round(group_stats$prop[2], 1), "%" ,  "\n",
  "Moderate (n=", group_stats$n[group_stats$anaemia_status == "Moderate"], "): ", round(group_stats$prop[3], 1),"%", "\n", 
  "Severe (n=", group_stats$n[group_stats$anaemia_status == "Severe"], "):         ",
  round(group_stats$prop[4], 1), "%"
)

p1 <- p + 
  geom_label(data = data.frame(
    case = "Cases",  # only show in this facet
    x = 0.415,
    y = 17.55,
    label = summary_label),
  aes(x = x, y = y, label = label),
  size = 3.25, hjust = 0, vjust = 1,
  fill = "white", color = "black", 
  label.size = 0.4, fontface = "plain",
  inherit.aes = FALSE)

p1
## `geom_smooth()` using formula = 'y ~ x'

###############

group_stats <- combined_lineplot_long_haem_anaemia %>% 
  dplyr::filter(Timepoint=="Enrolment" & case == "Controls") %>% 
  group_by(anaemia_status) %>% 
  summarise(n = sum(!is.na(anaemia_status)),
            prop = n/62*100) %>% 
  ungroup() %>% slice(-5)


summary_label <- paste0(
  "None (n=", group_stats$n[group_stats$anaemia_status == "None"], "):        ", round(group_stats$prop[1], 1),"%" , "\n",
  "Mild (n=", group_stats$n[group_stats$anaemia_status == "Mild"], "):          ", round(group_stats$prop[2], 1), "%" ,  "\n",
  "Moderate (n=", group_stats$n[group_stats$anaemia_status == "Moderate"], "): ", round(group_stats$prop[3], 1),"%", "\n", 
  "Severe (n=", group_stats$n[group_stats$anaemia_status == "Severe"], "):         ",
  round(group_stats$prop[4], 1), "%"
)

p2 <- p1 + 
  geom_label(data = data.frame(
    case = "Controls",  # only show in this facet
    x = 0.425,
    y = 17.55,
    label = summary_label),
  aes(x = x, y = y, label = label),
  size = 3.25, hjust = 0, vjust = 1,
  fill = "white", color = "black", 
  label.size = 0.4, fontface = "plain",
  inherit.aes = FALSE)

p2
## `geom_smooth()` using formula = 'y ~ x'

########################################################################


group_stats <- combined_lineplot_long_haem_anaemia %>% 
  dplyr::filter(Timepoint == "Month 2" & case == "Controls") %>% 
  group_by(anaemia_status) %>% 
  summarise(n = sum(!is.na(values)),
            prop = n/57*100) %>% ungroup() %>% slice(-4)


summary_label <- paste0(
  "None (n=", group_stats$n[group_stats$anaemia_status == "None"], "):        ", round(group_stats$prop[1], 1),"%" , "\n",
  "Mild (n=", group_stats$n[group_stats$anaemia_status == "Mild"], "):          ", round(group_stats$prop[2], 1), "%" ,  "\n",
  "Moderate (n=", group_stats$n[group_stats$anaemia_status == "Moderate"], "): ", round(group_stats$prop[3], 1),"%", "\n", 
  "Severe (n=",0, "):            ",
  0, "%"
)

p3 <- p2 + 
  geom_label(data = data.frame(
    case = "Controls",  # only show in this facet
    x = 1.85,
    y = 17.55,
    label = summary_label),
  aes(x = x, y = y, label = label),
  size = 3.25, hjust = 0, vjust = 1,
  fill = "white", color = "black", 
  label.size = 0.4, fontface = "plain",
  inherit.aes = FALSE)

p3
## `geom_smooth()` using formula = 'y ~ x'

ggsave("../Figures/Lineplots/Haemoglobin_lineplot.pdf", height = 6, width = 10.75)
## `geom_smooth()` using formula = 'y ~ x'

*** MUAC

combined_lineplot_long_haem <- read_csv("../Processed_data/combined_lineplot_long_haematology.csv")
## Rows: 1536 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): case, Timepoint, haematology
## dbl (2): subjid, values
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
combined_lineplot_long_MUAC <- combined_lineplot_long_haem %>%
  dplyr::filter(haematology == "MUAC") 

combined_lineplot_long_MUAC <- combined_lineplot_long_MUAC %>% 
  dplyr::mutate(wasting_status = 
                  case_when(values >= 12.5 ~ "No",
                  values < 11.5 ~ "Severe",
                  values >= 11.5 & values <12.5 ~ "Moderate"))


combined_lineplot_long_MUAC$wasting_status <- 
  factor(combined_lineplot_long_MUAC$wasting_status,
         levels = c("No","Moderate", "Severe"))


p <- ggplot(data = combined_lineplot_long_MUAC, aes(x = Timepoint, y = values)) +
  
  # Add boxplot
  geom_boxplot(alpha = 0.2, outlier.shape = NA, width = 0.3) +

  # Add individual subject lines and points
  geom_line(aes(group = subjid, color = case), size = 0.75, alpha = 0.5) +
  geom_smooth(aes(group=case), method = "loess", se = T, 
              color = "red",alpha = 0.5, size = 1)+
   geom_hline(yintercept = 12.5,color="darkgreen", linewidth=0.7, linetype=2)+
  geom_hline(yintercept = 11.5,color="purple", linewidth=0.7, linetype=2)+
  # Shaded regions
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = 11.5, ymax = 12.5, fill = "grey80", alpha = 0.35) + 
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = 7, ymax = 11.5, fill = "grey20", alpha = 0.3) +
  
  
  geom_point(aes(color = case), size = 0.7) +

  # Color scales
  ggsci::scale_color_nejm() +
  #ggsci::scale_fill_nejm() +

  # Facet by case
  facet_grid(~case, scales = "free_y") +

  # Add p-values
  stat_compare_means(
    comparisons = list(c("Enrolment", "Month 2")),
    method = "wilcox.test",
    paired = TRUE,
    label = "p.format",
    size = 2.8
  ) +

  # Labels and theme
  labs(
    y = "MUAC (cm)",
    x = "Timepoint",
    color = "Survival status",
    fill = "Survival status"
  ) +
  theme_pubr() +
  theme(
    axis.title = element_text(face = "bold", size = 10),
    strip.text = element_text(face = "bold", size = 10),
    axis.text = element_text(face = "bold", size = 10),
    legend.position = "none"
  )

ggsave("../Figures/Lineplots/MUAC_lineplot_Jul2025.pdf", height = 3.75, width = 4)
## `geom_smooth()` using formula = 'y ~ x'
combined_lineplot_long_MUAC$wasting_status <- 
  factor(combined_lineplot_long_MUAC$wasting_status,
         levels = c("No","Moderate", "Severe"))

group_stats <- combined_lineplot_long_MUAC %>% 
  dplyr::filter(Timepoint == "Enrolment" & case == "Cases") %>% 
  group_by(wasting_status) %>% 
  summarise(n = sum(!is.na(values)),
            prop = n/64*100) %>% ungroup()#  %>% slice(-4)

summary_label <- paste0(
  "No (n=",0,       "):              ",
  0,"%" , "\n",
  "Moderate (n=", group_stats$n[group_stats$wasting_status == "Moderate"], "):", round(group_stats$prop[1], 1),"%", "\n", 
    "Severe (n=", group_stats$n[group_stats$wasting_status == "Severe"], "): ", round(group_stats$prop[2], 1),"%")

p1 <- p + geom_label(data = data.frame(
    case = "Cases",  # only show in this facet
    x = 0.415,
    y = 15.55,
    label = summary_label),
  aes(x = x, y = y, label = label),
  size = 3.25, hjust = 0, vjust = 1,
  fill = "white", color = "black", 
  label.size = 0.4, fontface = "plain",
  inherit.aes = FALSE)


group_stats <- combined_lineplot_long_MUAC %>% 
  dplyr::filter(Timepoint == "Month 2" & case == "Cases") %>% 
  group_by(wasting_status) %>% 
  summarise(n = sum(!is.na(values)),
            prop = n/57*100) %>% ungroup() %>% slice(-4)

summary_label <- paste0(
  "No (n=", group_stats$n[group_stats$wasting_status == "No"], "):              ", round(group_stats$prop[1], 1), ".0" , "%" , "\n",
  "Moderate (n=", group_stats$n[group_stats$wasting_status == "Moderate"], "):", round(group_stats$prop[2], 1), "%" ,  "\n",
  "Severe (n=", group_stats$n[group_stats$wasting_status == "Severe"], "):    ", round(group_stats$prop[3], 1),"%")

p2 <- p1 + 
  geom_label(data = data.frame(
    case = "Cases",  # only show in this facet
    x = 1.85,
    y = 15.55,
    label = summary_label),
  aes(x = x, y = y, label = label),
  size = 3.25, hjust = 0, vjust = 1,
  fill = "white", color = "black", 
  label.size = 0.4, fontface = "plain",
  inherit.aes = FALSE)

  

group_stats <- combined_lineplot_long_MUAC %>% 
  dplyr::filter(Timepoint == "Enrolment" & case == "Controls") %>% 
  group_by(wasting_status) %>% 
  summarise(n = sum(!is.na(values)),
            prop = n/64*100) %>% ungroup()# %>% slice(-4)


summary_label <- paste0(
  "No (n=", group_stats$n[group_stats$wasting_status == "No"], "):             ", round(group_stats$prop[1], 1),"%" , "\n",
  "Moderate (n=", group_stats$n[group_stats$wasting_status == "Moderate"],"):  ", round(group_stats$prop[2], 1), "%" ,  "\n",
  "Severe (n=", group_stats$n[group_stats$wasting_status == "Severe"], "):  ", round(group_stats$prop[3], 1),"%"
)

p3 <- p2 + 
  geom_label(data = data.frame(
    case = "Controls",  # only show in this facet
    x = 0.425,
    y = 15.95,
    label = summary_label),
  aes(x = x, y = y, label = label),
  size = 3.25, hjust = 0, vjust = 1,
  fill = "white", color = "black", 
  label.size = 0.4, fontface = "plain",
  inherit.aes = FALSE)

#################################################
group_stats <- combined_lineplot_long_MUAC %>% 
  dplyr::filter(Timepoint == "Month 2" & case == "Controls") %>% 
  group_by(wasting_status) %>% 
  summarise(n = sum(!is.na(values)),
            prop = n/62*100) %>% ungroup() %>% slice(-4)


summary_label <- paste0(
  "No (n=", group_stats$n[group_stats$wasting_status == "No"], "):            ", round(group_stats$prop[1], 1),"%" , "\n",
  "Moderate (n=", group_stats$n[group_stats$wasting_status == "Moderate"],"): ", round(group_stats$prop[2], 1), "%" ,  "\n",
  "Severe (n=", group_stats$n[group_stats$wasting_status == "Severe"], "):    ", round(group_stats$prop[3], 1),"%"
)

p4 <- p3 + 
  geom_label(data = data.frame(
    case = "Controls",  # only show in this facet
    x = 2,
    y = 15.95,
    label = summary_label),
  aes(x = x, y = y, label = label),
  size = 3.25, hjust = 0, vjust = 1,
  fill = "white", color = "black", 
  label.size = 0.4, fontface = "plain",
  inherit.aes = FALSE)

p4
## `geom_smooth()` using formula = 'y ~ x'

ggsave("../Figures/Lineplots/MUAC_lineplot.pdf", height = 3.75, width = 4)
## `geom_smooth()` using formula = 'y ~ x'

Box plots

*** IGFBP6

proteomics_m2 <- read_csv("../Raw_data/Proteomics_rawdata_month2.csv")
## New names:
## Rows: 98 Columns: 590
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (590): record_id, ABO, BCHE, ALDOC, PRG1, VCL, PSAP, C9, TRIM33, DKFZp43...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `SERPINA1` -> `SERPINA1...28`
## • `THBS1` -> `THBS1...45`
## • `F5` -> `F5...81`
## • `TPM3` -> `TPM3...100`
## • `HBG1` -> `HBG1...140`
## • `HBB` -> `HBB...164`
## • `HBD` -> `HBD...236`
## • `IGL@` -> `IGL@...240`
## • `CP` -> `CP...248`
## • `IGF2` -> `IGF2...259`
## • `HBD` -> `HBD...268`
## • `IGF2` -> `IGF2...269`
## • `KNG1` -> `KNG1...307`
## • `FBLN1` -> `FBLN1...313`
## • `KNG1` -> `KNG1...319`
## • `APOB` -> `APOB...344`
## • `CD163` -> `CD163...347`
## • `SAA1` -> `SAA1...354`
## • `CD163` -> `CD163...356`
## • `CP` -> `CP...357`
## • `HBB` -> `HBB...360`
## • `HBG1` -> `HBG1...361`
## • `SERPINA1` -> `SERPINA1...368`
## • `CP` -> `CP...370`
## • `HBA2` -> `HBA2...379`
## • `KRT1` -> `KRT1...385`
## • `KRT1` -> `KRT1...386`
## • `TPM3` -> `TPM3...390`
## • `APOB` -> `APOB...440`
## • `THBS1` -> `THBS1...458`
## • `SAA1` -> `SAA1...469`
## • `FBLN1` -> `FBLN1...491`
## • `IGK@` -> `IGK@...528`
## • `F5` -> `F5...533`
## • `HBB` -> `HBB...538`
## • `IGL@` -> `IGL@...545`
## • `IGL@` -> `IGL@...553`
## • `IGK@` -> `IGK@...554`
## • `HBA2` -> `HBA2...559`
## • `IGL@` -> `IGL@...567`
## • `IGL@` -> `IGL@...568`
## • `HBA2` -> `HBA2...587`
proteomics_m2 <- proteomics_m2 %>% dplyr::rename("subjid"="record_id")

baseline <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")

cases_base <- baseline %>% dplyr::select(subjid, case)

cases_base$subjid <- as.character(cases_base$subjid)

proteomics_m2$subjid <- as.character(proteomics_m2$subjid)

proteomics_M2 <- proteomics_m2

proteomics_M2_bxplot <- proteomics_M2 %>% 
  dplyr::left_join(., cases_base, join_by(subjid))

proteomics_M2_bxplot$TP <- "Month 2"

proteomics <- read_csv("../Raw_data/Proteomics_rawdata_enrol.csv")
## New names:
## Rows: 92 Columns: 590
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (590): record_id, ABO, BCHE, ALDOC, PRG1, VCL, PSAP, C9, TRIM33, DKFZp43...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `SERPINA1` -> `SERPINA1...28`
## • `THBS1` -> `THBS1...45`
## • `F5` -> `F5...81`
## • `TPM3` -> `TPM3...100`
## • `HBG1` -> `HBG1...140`
## • `HBB` -> `HBB...164`
## • `HBD` -> `HBD...236`
## • `IGL@` -> `IGL@...240`
## • `CP` -> `CP...248`
## • `IGF2` -> `IGF2...259`
## • `HBD` -> `HBD...268`
## • `IGF2` -> `IGF2...269`
## • `KNG1` -> `KNG1...307`
## • `FBLN1` -> `FBLN1...313`
## • `KNG1` -> `KNG1...319`
## • `APOB` -> `APOB...344`
## • `CD163` -> `CD163...347`
## • `SAA1` -> `SAA1...354`
## • `CD163` -> `CD163...356`
## • `CP` -> `CP...357`
## • `HBB` -> `HBB...360`
## • `HBG1` -> `HBG1...361`
## • `SERPINA1` -> `SERPINA1...368`
## • `CP` -> `CP...370`
## • `HBA2` -> `HBA2...379`
## • `KRT1` -> `KRT1...385`
## • `KRT1` -> `KRT1...386`
## • `TPM3` -> `TPM3...390`
## • `APOB` -> `APOB...440`
## • `THBS1` -> `THBS1...458`
## • `SAA1` -> `SAA1...469`
## • `FBLN1` -> `FBLN1...491`
## • `IGK@` -> `IGK@...528`
## • `F5` -> `F5...533`
## • `HBB` -> `HBB...538`
## • `IGL@` -> `IGL@...545`
## • `IGL@` -> `IGL@...553`
## • `IGK@` -> `IGK@...554`
## • `HBA2` -> `HBA2...559`
## • `IGL@` -> `IGL@...567`
## • `IGL@` -> `IGL@...568`
## • `HBA2` -> `HBA2...587`
proteomics <- proteomics %>% dplyr::rename("subjid"="record_id")

cases_base$subjid <- as.character(cases_base$subjid)

proteomics$subjid <- as.character(proteomics$subjid)

proteomics_Enr <- proteomics

proteomics_Enr_bxplot <- proteomics_Enr %>% 
  dplyr::left_join(., cases_base, join_by(subjid))

proteomics_Enr_bxplot$TP <- "Enrolment"


proteomics_all <- bind_rows(proteomics_M2_bxplot,proteomics_Enr_bxplot)
proteomics_all <- proteomics_all %>% dplyr::select(subjid,IGFBP6,case,TP)

proteomics_all <- proteomics_all %>% 
  dplyr::mutate(case = case_when(case == "Case" ~ "Cases",
                                 case == "Control" ~ "Controls"))

proteomics_all$case <- factor(proteomics_all$case,
                                levels = c("Cases","Controls"))
pval_df <- data.frame(
  TP = c("Enrolment", "Month 2"),         # Facet variable
  group1 = c("Cases", "Cases"),
  group2 = c("Controls", "Controls"),
  p = c(0.000094, 0.084),
  y.position = c(log10(10), log10(10)),  # Customize Y positions
  label = c("p = 0.000094", "p = 0.084")
)

proteomics_all %>% dplyr::filter(!is.na(case)) %>% 
ggboxplot(x = "case", "IGFBP6", fill = "case", 
          bxp.errorbar = TRUE, 
          size = 0.15,
          width = 0.45,
          bxp.errorbar.width = 0.15,
          #facet.by = "TP"
          ) +
 # geom_point(size = 0.5)) +
  scale_y_log10() +
 stat_pvalue_manual(
    pval_df,
    label = "label",
    label.size = 2.88,
    y.position = "y.position"
  ) +
  labs(y = "IGFBP6 in pg/ml", x = NULL) +
  ggsci::scale_fill_nejm() +
  theme(legend.position = "none",
        strip.text = element_text(size = 12, face = "bold"),
        axis.title = element_text(size = 12, face = "bold"),
        axis.text = element_text(size = 9),
        axis.text.x = element_text(size = 9, face = "bold"))+
facet_wrap(~TP)

#ggsave("../Figures/IGFBP6.jpeg", height = 2.5, width = 2.5)

#ggsave("../Figures/Boxplots/IGFBP6.pdf", height = 3.75, width = 4)
ggsave("../Figures/Boxplots/IGFBP6_boxplot.pdf", height = 3.75, width = 4)

*** ADAMTS13

allplates_4plex <- read_csv("../Raw_data/rawdata_4plex_enrol.csv")
## Rows: 114 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): subjid, Thrombomodulin/BDCA-3, Angiopoietin-2, D-dimer, ADAMTS13
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cases_base$subjid <- as.character(cases_base$subjid)

allplates_4plex$subjid <- as.character(allplates_4plex$subjid)

allplates_4plex_bxplot_ENR <- allplates_4plex %>% 
  dplyr::left_join(., cases_base, join_by(subjid))

allplates_4plex_bxplot_ENR$TP <- "Enrolment"



allplates_4plex_M2 <- read_csv("../Raw_data/rawdata_4plex_m2.csv")
## Rows: 95 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): subjid, ADAMTS13-2, Angiopoietin-2-2, D-dimer-2, Thrombomodulin-2
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cases_base$subjid <- as.character(cases_base$subjid)

allplates_4plex_M2$subjid <- as.character(allplates_4plex_M2$subjid)


allplates_4plex_bxplot_M2 <- allplates_4plex_M2 %>% 
  dplyr::left_join(., cases_base, join_by(subjid))

colnames(allplates_4plex_bxplot_M2) <- gsub("-2$", "", colnames(allplates_4plex_bxplot_M2))

allplates_4plex_bxplot_M2$TP <- "Month 2"


combined4Plex <- bind_rows(allplates_4plex_bxplot_ENR,
                           allplates_4plex_bxplot_M2)

colnames(combined4Plex) <- gsub("-", "", colnames(combined4Plex))

combined4Plex <- combined4Plex %>% 
  dplyr::mutate(case = case_when(case == "Case" ~ "Cases",
                                 case == "Control" ~ "Controls"))

combined4Plex$case <- factor(combined4Plex$case, 
                              levels = c("Cases","Controls"))


write_csv(combined4Plex,"../Processed_data/combined_4plex_boxplot.csv")


pvals <- compare_means(
  ADAMTS13 ~ case,
  group.by = "TP",
  data = combined4Plex,
  method = "wilcox.test"
)
 
# STEP 2: Add custom y-positions for p-value labels per timepoint
pvals <- pvals %>%
  mutate(y.position = ifelse(TP == "Enrolment", log10(1900), log10(1900)))
 
# STEP 3: Plot with facet_wrap and correct p-values

combined4Plex %>% dplyr::filter(!is.na(case)) %>% 
ggboxplot(x = "case", y = "ADAMTS13", fill = "case", 
          bxp.errorbar = TRUE,
          size = 0.15,
          width = 0.45,
          bxp.errorbar.width = 0.15) +
  scale_y_log10() +
  stat_pvalue_manual(
    pvals,
    label = "p.format",
    label.size = 2.88,
    y.position = "y.position"
  ) +
  labs(y = "ADAMTS13 in pg/ml", x = NULL) +
  ggsci::scale_fill_nejm() +
  theme(legend.position = "none",
        strip.text = element_text(size = 12, face = "bold"),
        axis.title = element_text(size = 12, face = "bold"),
        axis.text = element_text(size = 9),
        axis.text.x = element_text(size = 9, face = "bold"))+
  facet_wrap(~TP)
## Warning in (function (mapping = NULL, data = NULL, geom = "boxplot", position =
## "dodge2", : Ignoring unknown aesthetics: fill

#ggsave("../Figures/Boxplots/ADAMTS13.jpeg", height = 2.5, width = 2.5)

#ggsave("../Figures/Boxplots/ADAMTS13.pdf", height = 3.75, width = 4)

ggsave("../Figures/Boxplots/ADAMTS13_boxplot.pdf", height = 3.75, width = 4)
allplates_29plex_M2<- read_csv("../Raw_data/rawdata_29plex_m2.csv")
## Rows: 96 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (30): subjid, EGF-2, Eotaxin-2, G-CSF-2, GM-CSF-2, IFNa2-2, IFNg-2, IL10...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
baseline <- read_xlsx("../Raw_data/CTX_case_control_variables_for_James10082023.xlsx")

cases_base <- baseline %>% dplyr::select(subjid, case)
cases_base$subjid <- as.character(cases_base$subjid)

allplates_29plex_M2$subjid <- as.character(allplates_29plex_M2$subjid)

allplates_29plex_bxplot_M2 <- allplates_29plex_M2 %>% 
  dplyr::left_join(., cases_base, join_by(subjid))

allplates_29plex_bxplot_M2$TP <- "Month 2"

colnames(allplates_29plex_bxplot_M2) <- gsub("-2$", "", colnames(allplates_29plex_bxplot_M2))

colnames(allplates_29plex_bxplot_M2) <- gsub("-", "", colnames(allplates_29plex_bxplot_M2))

allplates_29plex_bxplot_Enr <- read_csv("../Raw_data/rawdata_29plex_enrol.csv")
## Rows: 112 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (30): subjid, IL10, IFNg, IL-12p40, MIP-1a, IL-12p70, MIP-1b, G-CSF, VEG...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
allplates_29plex_bxplot_Enr$subjid <- as.character(allplates_29plex_bxplot_Enr$subjid)

allplates_29plex_bxplot_Enr <- allplates_29plex_bxplot_Enr %>% 
  dplyr::left_join(., cases_base, join_by(subjid))

allplates_29plex_bxplot_Enr$TP <- "Enrolment"

colnames(allplates_29plex_bxplot_Enr) <- gsub("-", "", colnames(allplates_29plex_bxplot_Enr))

combined29Plex <- bind_rows(allplates_29plex_bxplot_Enr, allplates_29plex_bxplot_M2)

combined29Plex <- combined29Plex %>% 
  dplyr::mutate(case = case_when(case == "Case" ~ "Cases",
                                 case == "Control" ~ "Controls"))

combined29Plex$case <- factor(combined29Plex$case, 
                              levels = c("Cases","Controls"))

write_csv(combined29Plex,"../Processed_data/combined_29plex_boxplot.csv")

*** IFNa2

combined29Plex <- read_csv("../Processed_data/combined_29plex_boxplot.csv")
## Rows: 208 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): case, TP
## dbl (30): subjid, IL10, IFNg, IL12p40, MIP1a, IL12p70, MIP1b, GCSF, VEGF, Eo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
combined29Plex$case <- factor(combined29Plex$case, 
                              levels = c("Cases","Controls"))


pvals <- compare_means(
  IFNa2 ~ case,
  group.by = "TP",
  data = combined29Plex,
  method = "wilcox.test"
)
 
# STEP 2: Add custom y-positions for p-value labels per timepoint
pvals <- pvals %>%
  mutate(y.position = ifelse(TP == "Enrolment", log10(400), log10(1200)))
 
# STEP 3: Plot with facet_wrap and correct p-values

combined29Plex %>% dplyr::filter(!is.na(case)) %>% 
ggboxplot(x = "case", y = "IFNa2", fill = "case", 
          bxp.errorbar = TRUE,
          size = 0.15,
          width = 0.45,
          bxp.errorbar.width = 0.15) +
  scale_y_log10() +
  stat_pvalue_manual(
    pvals,
    label = "p.format",
    label.size = 2.88,
    y.position = "y.position"
  ) +
  labs(y = "IFNa2 in pg/ml", x = NULL) +
  ggsci::scale_fill_nejm() +
  theme(
    legend.position = "none",
    strip.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 9),
    axis.text.x = element_text(size = 9, face = "bold")
  ) +
  facet_wrap(~TP)
## Warning in (function (mapping = NULL, data = NULL, geom = "boxplot", position =
## "dodge2", : Ignoring unknown aesthetics: fill

#ggsave("Results/IFNa2.jpeg", height = 2.5, width = 2.5)

#ggsave("../Figures/Boxplots/IFNa2.pdf", height = 3.75, width = 4)

ggsave("../Figures/Boxplots/IFNa2_boxplot.pdf", height = 3.75, width = 4)

*** IL15

combined29Plex <- read_csv("../Processed_data/combined_29plex_boxplot.csv")
## Rows: 208 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): case, TP
## dbl (30): subjid, IL10, IFNg, IL12p40, MIP1a, IL12p70, MIP1b, GCSF, VEGF, Eo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
combined29Plex$case <- factor(combined29Plex$case, 
                              levels = c("Cases","Controls"))
pvals <- compare_means(
  IL15 ~ case,
  group.by = "TP",
  data = combined29Plex,
  method = "wilcox.test"
)
 
# STEP 2: Add custom y-positions for p-value labels per timepoint
pvals <- pvals %>%
  mutate(y.position = ifelse(TP == "Enrolment", log10(300), log10(350)))
 
# STEP 3: Plot with facet_wrap and correct p-values

combined29Plex %>% dplyr::filter(!is.na(case)) %>% 
ggboxplot(x = "case", y = "IL15", fill = "case", 
          bxp.errorbar = TRUE,
          size = 0.15,
          width = 0.45,
          bxp.errorbar.width = 0.15) +
  scale_y_log10() +
  stat_pvalue_manual(
    pvals,
    label = "p.format",
    label.size = 2.88,
    y.position = "y.position"
  ) +
  labs(y = "IL15 in pg/ml", x = NULL) +
  ggsci::scale_fill_nejm() +
  theme(legend.position = "none",
        strip.text = element_text(size = 12, face = "bold"),
        axis.title = element_text(size = 12, face = "bold"),
        axis.text = element_text(size = 9),
        axis.text.x = element_text(size = 9, face = "bold"))+
  facet_wrap(~TP)
## Warning in (function (mapping = NULL, data = NULL, geom = "boxplot", position =
## "dodge2", : Ignoring unknown aesthetics: fill

#ggsave("../Figures/Boxplots/IL15.jpeg", height = 2.5, width = 2.5)

#ggsave("../Figures/IL15.pdf", height = 3.75, width = 4)

ggsave("../Figures/Boxplots/IL15_boxplot.pdf", height = 3.75, width = 4)

*** IL10

combined29Plex <- read_csv("../Processed_data/combined_29plex_boxplot.csv")
## Rows: 208 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): case, TP
## dbl (30): subjid, IL10, IFNg, IL12p40, MIP1a, IL12p70, MIP1b, GCSF, VEGF, Eo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
combined29Plex$case <- factor(combined29Plex$case, 
                              levels = c("Cases","Controls"))
pvals <- compare_means(
  IL10 ~ case,
  group.by = "TP",
  data = combined29Plex,
  method = "wilcox.test"
)
 
# STEP 2: Add custom y-positions for p-value labels per timepoint
pvals <- pvals %>%
  mutate(y.position = ifelse(TP == "Enrolment", log10(300), log10(350)))
 
# STEP 3: Plot with facet_wrap and correct p-values

combined29Plex %>% dplyr::filter(!is.na(case)) %>% 
ggboxplot(x = "case", y = "IL10", fill = "case", 
          bxp.errorbar = TRUE,
          size = 0.15,
          width = 0.45,
          bxp.errorbar.width = 0.15) +
  scale_y_log10() +
  stat_pvalue_manual(
    pvals,
    label = "p.format",
    label.size = 2.88,
    y.position = "y.position"
  ) +
  labs(y = "IL10 in pg/ml", x = NULL) +
  ggsci::scale_fill_nejm() +
  theme(legend.position = "none",
        strip.text = element_text(size = 12, face = "bold"),
        axis.title = element_text(size = 12, face = "bold"),
        axis.text = element_text(size = 9),
        axis.text.x = element_text(size = 9, face = "bold"))+
  facet_wrap(~TP)
## Warning in (function (mapping = NULL, data = NULL, geom = "boxplot", position =
## "dodge2", : Ignoring unknown aesthetics: fill

#ggsave("../Figures/Boxplots/IL10.jpeg", height = 3.5, width = 2)

#ggsave("../Figures/Boxplots/IL10.pdf", height = 3.75, width = 4)

ggsave("../Figures/Boxplots/IL10_boxplot.pdf", height = 3.75, width = 4)